From f448631ace2e6ea31e1f506d9444185a099981c0 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Mon, 12 Jun 2023 10:51:24 -0400 Subject: [PATCH 01/55] [SYCL][ESIMD][E2E] Fix compile fail for lsc_gather_scatter_stateless_64.cpp test (#9816) This test never compiled since I added it, but we didn't catch it because it only runs on PVC. We need to provide more template arguments for the conversion operator to kick in for the host compile. Signed-off-by: Sarnie, Nick --- .../lsc/lsc_gather_scatter_stateless_64.cpp | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_gather_scatter_stateless_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_gather_scatter_stateless_64.cpp index 6ff0dc9af80fa..02610c944b86c 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_gather_scatter_stateless_64.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_gather_scatter_stateless_64.cpp @@ -40,18 +40,26 @@ int main() { uint64_t offsetStart = (Size - VL) * sizeof(uint64_t); simd offset(offsetStart, sizeof(uint64_t)); simd beginning(0, sizeof(uint64_t)); - simd va = lsc_gather(PA, beginning); + simd va = + lsc_gather(PA, beginning); simd_mask pred = 1; simd old_values = 0; lsc_prefetch(PA, offset); + cache_hint::cached, cache_hint::cached, VL>(PA, offset); simd vb = - lsc_gather(PA, offset, pred, old_values); - simd vc = lsc_gather(PA, offset); + lsc_gather( + PA, offset, pred, old_values); + simd vc = + lsc_gather(PA, offset); va *= 5; vb += vc; - lsc_scatter(PA, beginning, va); - lsc_scatter(PA, offset, vb); + lsc_scatter(PA, beginning, va); + lsc_scatter(PA, offset, vb); }); }).wait(); } catch (sycl::exception const &e) {
From 2910add75c712a5fab3e2fecc6bfc61fd4884417 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Mon, 12 Jun 2023 17:19:12 +0200 Subject: [PATCH 02/55] [SYCL] Enhance device code split call graph analysis (#8589) This patch introduces significant changes to how device code split detects functions and global variables which should be included in a cloned module. There are two main changes: 1. the analysis algorithm now traces uses of global variables, to avoid adding all globals into every split module 2. the analysis algorithm now traces indirect calls, trying to build a list of all functions which are potentially called indirectly, to avoid the need to disable device code split completely in the presence of indirect calls Both things are implemented through the new `DependencyGraph` entity, which replaces the `CallGraph` entity we used previously. Instead of calls, the new graph is built over _uses_ of functions and variables to understand which functions and global variables are used by which functions and global variables. The most tricky part here is indirect calls: we can't understand which exact function is being called by an indirect call. However, we can compile a list of _potentially_-called functions by comparing function signatures with the signature of an indirect call. On top of that, ESIMD handling is refactored by this patch: - outlined ESIMD-specific handling into a separate function - created a new ESIMD-specific device code split helper The new ESIMD-specific device code split helper is needed because we should use different rules for the ESIMD and non-ESIMD parts of a module when splitting it in two. For the ESIMD part we want to grab all ESIMD functions, even if they were not considered entry points in the original module. For the non-ESIMD part we **don't want** to grab _any_ ESIMD functions, even if they are referenced/used by non-ESIMD functions.
Both of those special rules come from `invoke_simd` feature support: a non-ESIMD kernel can indirectly reference an ESIMD function. Since those different kinds of functions require different processing, we have to completely separate them before the processing step. The non-ESIMD module could be incomplete as a result of such a split, but it will be merged back with the ESIMD module after ESIMD lowering. That merge step is required for `invoke_simd` functionality. --------- Co-authored-by: Cai, Justin --- .../assert/indirect-with-split-2.ll | 14 +- .../assert/indirect-with-split.ll | 22 +- .../device-code-split/auto-module-split-2.ll | 60 +++- .../device-code-split/auto-module-split-3.ll | 52 ++- .../auto-module-split-func-ptr.ll | 20 +- .../complex-indirect-call-chain.ll | 89 +++++ .../device-code-split/vtable.ll | 170 ++++++++++ .../tools/sycl-post-link/registerallocmode.ll | 3 +- .../sycl-esimd/invoke-esimd-double.ll | 71 ++++ .../no-sycl-esimd-split-shared-func.ll | 50 +-- llvm/tools/sycl-post-link/ModuleSplitter.cpp | 311 ++++++++++++------ llvm/tools/sycl-post-link/ModuleSplitter.h | 6 +- llvm/tools/sycl-post-link/sycl-post-link.cpp | 115 +++---- 13 files changed, 756 insertions(+), 227 deletions(-) create mode 100644 llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll create mode 100644 llvm/test/tools/sycl-post-link/device-code-split/vtable.ll create mode 100644 llvm/test/tools/sycl-post-link/sycl-esimd/invoke-esimd-double.ll
diff --git a/llvm/test/tools/sycl-post-link/assert/indirect-with-split-2.ll b/llvm/test/tools/sycl-post-link/assert/indirect-with-split-2.ll index 74ff5eaac045e..65e1420d81356 100644 --- a/llvm/test/tools/sycl-post-link/assert/indirect-with-split-2.ll +++ b/llvm/test/tools/sycl-post-link/assert/indirect-with-split-2.ll @@ -9,8 +9,16 @@ ; marked as using asserts. ; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table -; RUN: FileCheck %s -input-file=%t_0.prop -check-prefix=PRESENCE-CHECK -; RUN: FileCheck %s -input-file=%t_0.prop -check-prefix=ABSENCE-CHECK +; RUN: FileCheck %s -input-file=%t_0.prop -check-prefixes=CHECK,CHECK0 \ +; RUN: --implicit-check-not TU1 +; RUN: FileCheck %s -input-file=%t_1.prop -check-prefixes=CHECK,CHECK1 \ +; RUN: --implicit-check-not TU0 +; +; CHECK: [SYCL/assert used] +; CHECK0-DAG: main_TU1_kernel0 +; CHECK0-DAG: main_TU1_kernel1 +; +; CHECK1: main_TU0_kernel0 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir64-unknown-linux" @@ -40,7 +48,7 @@ entry: } ; ABSENCE-CHECK-NOT: empty_kernel -define dso_local spir_kernel void @empty_kernel() { +define dso_local spir_kernel void @empty_kernel() #2 { %1 = ptrtoint void ()* @bar to i64 ret void } diff --git a/llvm/test/tools/sycl-post-link/assert/indirect-with-split.ll b/llvm/test/tools/sycl-post-link/assert/indirect-with-split.ll index c678633b9f841..e7ba6c43bb240 100644 --- a/llvm/test/tools/sycl-post-link/assert/indirect-with-split.ll +++ b/llvm/test/tools/sycl-post-link/assert/indirect-with-split.ll @@ -7,7 +7,21 @@ ; marked as using asserts.
; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table -; RUN: FileCheck %s -input-file=%t_0.prop +; RUN: FileCheck %s -input-file=%t_0.prop --check-prefixes=CHECK,CHECK1 \ +; RUN: --implicit-check-not TU0 +; RUN: FileCheck %s -input-file=%t_1.prop --check-prefixes=CHECK,CHECK0 \ +; RUN: --implicit-check-not TU1 --implicit-check-not kernel1 +; +; With recent improvements to device code split, this file is actually being +; split to two modules and one of them does not contain "indirectly-referenced" +; function, meaning that only direct users of 'assert' will be mentioned in +; device image properties. +; +; CHECK: [SYCL/assert used] +; CHECK0: main_TU0_kernel0 +; +; CHECK1-DAG: main_TU1_kernel0 +; CHECK1-DAG: main_TU1_kernel1 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir64-unknown-linux" @@ -20,9 +34,6 @@ target triple = "spir64-unknown-linux" @__spirv_BuiltInLocalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 @_ZL10assert_fmt = internal addrspace(2) constant [85 x i8] c"%s:%d: %s: global id: [%lu,%lu,%lu], local id: [%lu,%lu,%lu] Assertion `%s` failed.\0A\00", align 1 -; CHECK: [SYCL/assert used] - -; CHECK-DAG: main_TU0_kernel0 define dso_local spir_kernel void @main_TU0_kernel0() #0 { entry: call spir_func void @_Z3foov() @@ -40,7 +51,6 @@ entry: ret void } -; CHECK-DAG: main_TU0_kernel1 define dso_local spir_kernel void @main_TU0_kernel1() #0 { entry: call spir_func void @_Z4foo1v() @@ -55,14 +65,12 @@ entry: ret void } -; CHECK-DAG: main_TU1_kernel0 define dso_local spir_kernel void @main_TU1_kernel0() #2 { entry: call spir_func void @_Z3foov() ret void } -; CHECK-DAG: main_TU1_kernel1 define dso_local spir_kernel void @main_TU1_kernel1() #2 { entry: call spir_func void @_Z4foo2v() diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll index 29a2688397d49..dec4d40d561e8 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll @@ -1,17 +1,30 @@ ; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table -; In precense of indirectly callable function auto mode is equal to no split, -; which means that separate LLVM IR file for device is not generated and we only -; need to check generated symbol table -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK +; +; This is the same as auto-module-split-1 test with the only difference is that +; @_Z3foov is marked with "referenced-indirectly" attribute. 
+; The purpose of this test is to make sure that we can still perform device code +; split as usual, because that function is not a part of any indirect calls +; +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0,CHECK +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir64-unknown-linux" $_Z3barIiET_S0_ = comdat any +; CHECK-TU1-NOT: @{{.*}}GV{{.*}} +; CHECK-TU0: @{{.*}}GV{{.*}} = internal addrspace(1) constant [1 x i32] [i32 42], align 4 @_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 -; CHECK: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU1-TXT: {{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel0{{.*}} +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel0{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo{{.*}}() define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { entry: @@ -19,6 +32,11 @@ entry: ret void } +; CHECK-TU1: define dso_local spir_func void @{{.*}}foo{{.*}}() +; CHECK-TU0-NOT: define dso_local spir_func void @{{.*}}foo{{.*}}() + +; CHECK-TU1: call spir_func i32 @{{.*}}bar{{.*}}(i32 1) + define dso_local spir_func void @_Z3foov() #2 { entry: %a = alloca i32, align 4 @@ -28,6 +46,9 @@ entry: ret void } +; CHECK-TU1: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) +; CHECK-TU0-NOT: define {{.*}} spir_func i32 @{{.*}}bar{{.*}}(i32 %arg) + ; Function Attrs: nounwind define linkonce_odr dso_local spir_func i32 @_Z3barIiET_S0_(i32 %arg) comdat { entry: @@ -37,7 +58,12 @@ entry: ret i32 %0 } -; CHECK: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU1: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU1-TXT: {{.*}}TU0_kernel1{{.*}} +; CHECK-TU0-NOT: define dso_local spir_kernel void @{{.*}}TU0_kernel1{{.*}}() +; CHECK-TU0-TXT-NOT: {{.*}}TU0_kernel1{{.*}} + +; CHECK-TU1: call spir_func void @{{.*}}foo1{{.*}}() define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { entry: @@ -45,6 +71,9 @@ entry: ret void } +; CHECK-TU1: define dso_local spir_func void @{{.*}}foo1{{.*}}() +; CHECK-TU0-NOT: define dso_local spir_func void @{{.*}}foo1{{.*}}() + ; Function Attrs: nounwind define dso_local spir_func void @_Z4foo1v() { entry: @@ -52,7 +81,13 @@ entry: store i32 2, i32* %a, align 4 ret void } -; CHECK: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU1-NOT: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU1-TXT-NOT: {{.*}}TU1_kernel{{.*}} +; CHECK-TU0: define dso_local spir_kernel void @{{.*}}TU1_kernel{{.*}}() +; CHECK-TU0-TXT: {{.*}}TU1_kernel{{.*}} + +; CHECK-TU0: call spir_func void @{{.*}}foo2{{.*}}() define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { entry: @@ -60,10 +95,14 @@ entry: ret void } +; CHECK-TU1-NOT: define dso_local spir_func void @{{.*}}foo2{{.*}}() +; CHECK-TU0: define dso_local spir_func void @{{.*}}foo2{{.*}}() + ; Function Attrs: nounwind define dso_local spir_func void @_Z4foo2v() { entry: %a = alloca i32, align 4 +; CHECK-TU0: %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @{{.*}}GV{{.*}} to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x 
i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 %add = add nsw i32 4, %0 store i32 %add, i32* %a, align 4 @@ -74,8 +113,15 @@ attributes #0 = { "sycl-module-id"="TU1.cpp" } attributes #1 = { "sycl-module-id"="TU2.cpp" } attributes #2 = { "referenced-indirectly" } +; Metadata is saved in both modules. +; CHECK: !opencl.spir.version = !{!0, !0} +; CHECK: !spirv.Source = !{!1, !1} + !opencl.spir.version = !{!0, !0} !spirv.Source = !{!1, !1} +; CHECK: !0 = !{i32 1, i32 2} +; CHECK: !1 = !{i32 4, i32 100000} + !0 = !{i32 1, i32 2} !1 = !{i32 4, i32 100000} diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll index f8063cf53892b..9ff11e200c64c 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll @@ -1,8 +1,31 @@ ; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table -; In precense of indirect calls auto mode is equal to no split, -; which means that separate LLVM IR file for device is not generated and we only -; need to check generated symbol table -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK +; +; In precense of indirect calls we start matching functions using their +; signatures, i.e. we have an indirect call to i32(i32) function within +; @_Z3foov, which means that all functions with i32(i32) signature should be +; placed in the same module as @_Z3foov. +; +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-TU0-IR \ +; RUN: --implicit-check-not TU0_kernel --implicit-check-not _Z3foov +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1-IR \ +; RUN: --implicit-check-not TU1_kernel --implicit-check-not _Z4foo2v +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-SYM +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-SYM +; +; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel0 +; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel1 +; +; CHECK-TU1-SYM: _ZTSZ4mainE10TU0_kernel +; +; CHECK-TU0-IR: @_ZL2GV = internal addrspace(1) constant +; CHECK-TU0-IR: define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel0 +; CHECK-TU0-IR: define dso_local spir_func i32 @_Z4foo1v +; CHECK-TU0-IR: define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel1 +; CHECK-TU0-IR: define dso_local spir_func void @_Z4foo2v +; +; CHECK-TU1-IR: define dso_local spir_kernel void @_ZTSZ4mainE10TU0_kernel +; CHECK-TU1-IR: define dso_local spir_func void @_Z3foov +; CHECK-TU1-IR: define dso_local spir_func i32 @_Z4foo1v target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" target triple = "spir64-unknown-linux" @@ -11,9 +34,7 @@ $_Z3barIiET_S0_ = comdat any @_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 -; CHECK: {{.*}}TU0_kernel0{{.*}} - -define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel0() #0 { +define dso_local spir_kernel void @_ZTSZ4mainE10TU0_kernel() #0 { entry: call spir_func void @_Z3foov() ret void @@ -38,24 +59,23 @@ entry: ret i32 %0 } -; CHECK: {{.*}}TU0_kernel1{{.*}} - -define dso_local spir_kernel void @_ZTSZ4mainE11TU0_kernel1() #0 { +define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel0() #1 { entry: - call spir_func void @_Z4foo1v() + %a = alloca i32, align 4 + %arg = load i32, i32* %a, align 4 + %call = call spir_func i32 @_Z4foo1v(i32 %arg) ret void } ; 
Function Attrs: nounwind -define dso_local spir_func void @_Z4foo1v() { +define dso_local spir_func i32 @_Z4foo1v(i32 %arg) { entry: %a = alloca i32, align 4 - store i32 2, i32* %a, align 4 - ret void + store i32 %arg, i32* %a, align 4 + ret i32 %arg } -; CHECK: {{.*}}TU1_kernel{{.*}} -define dso_local spir_kernel void @_ZTSZ4mainE10TU1_kernel() #1 { +define dso_local spir_kernel void @_ZTSZ4mainE11TU1_kernel1() #1 { entry: call spir_func void @_Z4foo2v() ret void diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll index 43da659e7f85b..a7e17701ba01d 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll @@ -1,8 +1,20 @@ ; RUN: sycl-post-link -split=auto -symbols -S < %s -o %t.table -; RUN: FileCheck %s -input-file=%t_0.sym - -; This test checkes that module is not split if function pointer's user is not -; CallInst. +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix=CHECK-SYM0 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix=CHECK-SYM1 +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix=CHECK-IR0 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix=CHECK-IR1 + +; This test checkes that we can properly perform device code split by tracking +; all uses of functions (not only direct calls) + +; CHECK-SYM0: kernel2 +; CHECK-SYM1: kernel1 +; +; CHECK-IR0: define dso_local spir_kernel void @kernel2 +; +; CHECK-IR1: @_Z2f1iTable = weak global [1 x i32 (i32)*] [i32 (i32)* @_Z2f1i] +; CHECK-IR1: define dso_local spir_func i32 @_Z2f1i +; CHECK-IR1: define weak_odr dso_local spir_kernel void @kernel1 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64_x86_64-unknown-unknown" diff --git a/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll b/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll new file mode 100644 index 0000000000000..3c2b87e4fe4de --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll @@ -0,0 +1,89 @@ +; The idea of the test is to ensure that sycl-post-link can trace through more +; complex call stacks involving several nested indirect calls + +; RUN: sycl-post-link -split=auto -S < %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @baz_2 --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C +; +; RUN: sycl-post-link -split=source -S < %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not 
@baz_2 --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C +; +; RUN: sycl-post-link -split=kernel -S < %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \ +; RUN: --implicit-check-not @foo --implicit-check-not @kernel_A \ +; RUN: --implicit-check-not @kernel_B +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK1 \ +; RUN: --implicit-check-not @kernel_A --implicit-check-not @kernel_C +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefix CHECK2 \ +; RUN: --implicit-check-not @foo --implicit-check-not @bar \ +; RUN: --implicit-check-not @baz_2 --implicit-check-not @kernel_B \ +; RUN: --implicit-check-not @kernel_C + +; CHECK0-DAG: define spir_kernel void @kernel_C +; CHECK0-DAG: define spir_func i32 @bar +; CHECK0-DAG: define spir_func void @baz +; CHECK0-DAG: define spir_func void @baz_2 + +; CHECK1-DAG: define spir_kernel void @kernel_B +; CHECK1-DAG: define spir_func i32 @foo +; CHECK1-DAG: define spir_func i32 @bar +; CHECK1-DAG: define spir_func void @baz +; CHECK1-DAG: define spir_func void @baz_2 + +; CHECK2-DAG: define spir_kernel void @kernel_A +; CHECK2-DAG: define spir_func void @baz + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +define spir_func i32 @foo(i32 (i32, void ()*)* %ptr1, void ()* %ptr2) { + %1 = call spir_func i32 %ptr1(i32 42, void ()* %ptr2) + ret i32 %1 +} + +define spir_func i32 @bar(i32 %arg, void ()* %ptr) { + call spir_func void %ptr() + ret i32 %arg +} + +define spir_func void @baz() { + ret void +} + +define spir_func void @baz_2() { + ret void +} + +define spir_kernel void @kernel_A() #0 { + call spir_func void @baz() + ret void +} + +define spir_kernel void @kernel_B() #1 { + call spir_func i32 @foo(i32 (i32, void ()*)* null, void ()* null) + ret void +} + +define spir_kernel void @kernel_C() #2 { + call spir_func i32 @bar(i32 42, void ()* null) + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } +attributes #2 = { "sycl-module-id"="TU3.cpp" } diff --git a/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll b/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll new file mode 100644 index 0000000000000..44aa049feabbb --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll @@ -0,0 +1,170 @@ +; This test ensures that sycl-post-link properly handles cases when one global +; object used in a kernel, is being initialized with another global object. 
+; +; To make the example more realistic, this IR comes from the following SYCL +; snippet: +; +; class Base { +; public: +; virtual int display() { return 1; } +; }; +; +; class Derived1 : public Base { +; public: +; int display() { return 2; } +; }; +; +; class Derived2 : public Base { +; public: +; int display() { return 3; } +; }; +; +; int main() { +; sycl::queue Q; +; +; auto *Storage = +; sycl::malloc_device(sizeof(Derived1) + sizeof(Derived2), Q); +; auto *Ptrs = sycl::malloc_device(2, Q); +; +; Q.single_task([=] { +; Ptrs[0] = new (&Storage[0]) Derived1; +; Ptrs[1] = new (&Storage[sizeof(Derived1)]) Derived2; +; }).wait(); +; } +; +; Compiled with clang++ -fsycl -fsycl-device-only -O2 +; -Xclang -fsycl-allow-virtual-functions -fno-sycl-instrument-device-code +; +; The aim of the test is to check that 'display' method referenced from global +; variables storing vtable, are also included into the final module, even though +; they are not directly used in a kernel otherwise. +; +; RUN: sycl-post-link -split=auto -S < %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll +; +; CHECK-DAG: @_ZTV8Derived1 = {{.*}} @_ZN8Derived17displayEv +; CHECK-DAG: @_ZTV8Derived2 = {{.*}} @_ZN8Derived27displayEv +; +; CHECK-DAG: define {{.*}} i32 @_ZN8Derived17displayEv +; CHECK-DAG: define {{.*}} i32 @_ZN8Derived27displayEv + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +%class.Base = type { i32 (...)** } +%class.Derived1 = type { %class.Base } +%class.Derived2 = type { %class.Base } + +@_ZTV8Derived1 = linkonce_odr dso_local unnamed_addr constant { [3 x i8*] } { [3 x i8*] [i8* null, i8* bitcast ({ i8 addrspace(4)*, i8*, i8* }* @_ZTI8Derived1 to i8*), i8* bitcast (i32 (%class.Derived1 addrspace(4)*)* @_ZN8Derived17displayEv to i8*)] }, align 8 +@_ZTVN10__cxxabiv120__si_class_type_infoE = external dso_local addrspace(1) global i8 addrspace(4)* +@_ZTS8Derived1 = linkonce_odr dso_local constant [10 x i8] c"8Derived1\00", align 1 +@_ZTVN10__cxxabiv117__class_type_infoE = external dso_local addrspace(1) global i8 addrspace(4)* +@_ZTS4Base = linkonce_odr dso_local constant [6 x i8] c"4Base\00", align 1 +@_ZTI4Base = linkonce_odr dso_local constant { i8 addrspace(4)*, i8* } { i8 addrspace(4)* bitcast (i8 addrspace(4)* addrspace(4)* getelementptr inbounds (i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* addrspacecast (i8 addrspace(4)* addrspace(1)* @_ZTVN10__cxxabiv117__class_type_infoE to i8 addrspace(4)* addrspace(4)*), i64 2) to i8 addrspace(4)*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @_ZTS4Base, i32 0, i32 0) }, align 8 +@_ZTI8Derived1 = linkonce_odr dso_local constant { i8 addrspace(4)*, i8*, i8* } { i8 addrspace(4)* bitcast (i8 addrspace(4)* addrspace(4)* getelementptr inbounds (i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* addrspacecast (i8 addrspace(4)* addrspace(1)* @_ZTVN10__cxxabiv120__si_class_type_infoE to i8 addrspace(4)* addrspace(4)*), i64 2) to i8 addrspace(4)*), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @_ZTS8Derived1, i32 0, i32 0), i8* bitcast ({ i8 addrspace(4)*, i8* }* @_ZTI4Base to i8*) }, align 8 +@_ZTV8Derived2 = linkonce_odr dso_local unnamed_addr constant { [3 x i8*] } { [3 x i8*] [i8* null, i8* bitcast ({ i8 addrspace(4)*, i8*, i8* }* @_ZTI8Derived2 to i8*), i8* bitcast (i32 (%class.Derived2 addrspace(4)*)* @_ZN8Derived27displayEv to i8*)] }, align 8 +@_ZTS8Derived2 = linkonce_odr dso_local constant [10 x i8] c"8Derived2\00", align 1 
+@_ZTI8Derived2 = linkonce_odr dso_local constant { i8 addrspace(4)*, i8*, i8* } { i8 addrspace(4)* bitcast (i8 addrspace(4)* addrspace(4)* getelementptr inbounds (i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* addrspacecast (i8 addrspace(4)* addrspace(1)* @_ZTVN10__cxxabiv120__si_class_type_infoE to i8 addrspace(4)* addrspace(4)*), i64 2) to i8 addrspace(4)*), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @_ZTS8Derived2, i32 0, i32 0), i8* bitcast ({ i8 addrspace(4)*, i8* }* @_ZTI4Base to i8*) }, align 8 + +; Function Attrs: mustprogress norecurse nounwind +define weak_odr dso_local spir_kernel void @_ZTSZ4mainEUlvE_(%class.Base addrspace(4)* addrspace(1)* noundef align 8 %_arg_Ptrs, i8 addrspace(1)* noundef align 1 %_arg_Storage) local_unnamed_addr #0 !srcloc !48 !kernel_arg_buffer_location !49 !sycl_fixed_targets !50 !sycl_kernel_omit_args !51 { +entry: + %0 = bitcast i8 addrspace(1)* %_arg_Storage to %class.Derived1 addrspace(1)* + %1 = addrspacecast i8 addrspace(1)* %_arg_Storage to %class.Derived1 addrspace(4)* + %2 = getelementptr %class.Derived1, %class.Derived1 addrspace(1)* %0, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV8Derived1, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)** addrspace(1)* %2, align 8, !tbaa !52 + %3 = getelementptr %class.Derived1, %class.Derived1 addrspace(4)* %1, i64 0, i32 0 + store %class.Base addrspace(4)* %3, %class.Base addrspace(4)* addrspace(1)* %_arg_Ptrs, align 8, !tbaa !55 + %arrayidx4.i5 = getelementptr inbounds i8, i8 addrspace(1)* %_arg_Storage, i64 8 + %arrayidx4.i = addrspacecast i8 addrspace(1)* %arrayidx4.i5 to i8 addrspace(4)* + %4 = bitcast i8 addrspace(1)* %arrayidx4.i5 to i32 (...)** addrspace(1)* + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV8Derived2, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)** addrspace(1)* %4, align 8, !tbaa !52 + %arrayidx6.i6 = getelementptr inbounds %class.Base addrspace(4)*, %class.Base addrspace(4)* addrspace(1)* %_arg_Ptrs, i64 1 + %5 = bitcast %class.Base addrspace(4)* addrspace(1)* %arrayidx6.i6 to i8 addrspace(4)* addrspace(1)* + store i8 addrspace(4)* %arrayidx4.i, i8 addrspace(4)* addrspace(1)* %5, align 8, !tbaa !55 + ret void +} + +; Function Attrs: mustprogress norecurse nounwind +define linkonce_odr dso_local spir_func noundef i32 @_ZN8Derived17displayEv(%class.Derived1 addrspace(4)* noundef align 8 dereferenceable_or_null(8) %this) unnamed_addr #1 align 2 !srcloc !58 { +entry: + ret i32 2 +} + +; Function Attrs: mustprogress norecurse nounwind +define linkonce_odr dso_local spir_func noundef i32 @_ZN8Derived27displayEv(%class.Derived2 addrspace(4)* noundef align 8 dereferenceable_or_null(8) %this) unnamed_addr #1 align 2 !srcloc !59 { +entry: + ret i32 3 +} + +declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(i8 addrspace(2)*, ...) 
+ +attributes #0 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="vf2.cpp" "sycl-optlevel"="2" "sycl-single-task" "uniform-work-group-size"="true" } +attributes #1 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-optlevel"="2" } + +!llvm.module.flags = !{!0, !1} +!opencl.spir.version = !{!2} +!spirv.Source = !{!3} +!sycl_aspects = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46} +!llvm.ident = !{!47} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"frame-pointer", i32 2} +!2 = !{i32 1, i32 2} +!3 = !{i32 4, i32 100000} +!4 = !{!"cpu", i32 1} +!5 = !{!"gpu", i32 2} +!6 = !{!"accelerator", i32 3} +!7 = !{!"custom", i32 4} +!8 = !{!"fp16", i32 5} +!9 = !{!"fp64", i32 6} +!10 = !{!"image", i32 9} +!11 = !{!"online_compiler", i32 10} +!12 = !{!"online_linker", i32 11} +!13 = !{!"queue_profiling", i32 12} +!14 = !{!"usm_device_allocations", i32 13} +!15 = !{!"usm_host_allocations", i32 14} +!16 = !{!"usm_shared_allocations", i32 15} +!17 = !{!"usm_system_allocations", i32 17} +!18 = !{!"ext_intel_pci_address", i32 18} +!19 = !{!"ext_intel_gpu_eu_count", i32 19} +!20 = !{!"ext_intel_gpu_eu_simd_width", i32 20} +!21 = !{!"ext_intel_gpu_slices", i32 21} +!22 = !{!"ext_intel_gpu_subslices_per_slice", i32 22} +!23 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23} +!24 = !{!"ext_intel_max_mem_bandwidth", i32 24} +!25 = !{!"ext_intel_mem_channel", i32 25} +!26 = !{!"usm_atomic_host_allocations", i32 26} +!27 = !{!"usm_atomic_shared_allocations", i32 27} +!28 = !{!"atomic64", i32 28} +!29 = !{!"ext_intel_device_info_uuid", i32 29} +!30 = !{!"ext_oneapi_srgb", i32 30} +!31 = !{!"ext_oneapi_native_assert", i32 31} +!32 = !{!"host_debuggable", i32 32} +!33 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33} +!34 = !{!"ext_oneapi_cuda_async_barrier", i32 34} +!35 = !{!"ext_oneapi_bfloat16_math_functions", i32 35} +!36 = !{!"ext_intel_free_memory", i32 36} +!37 = !{!"ext_intel_device_id", i32 37} +!38 = !{!"ext_intel_memory_clock_rate", i32 38} +!39 = !{!"ext_intel_memory_bus_width", i32 39} +!40 = !{!"emulated", i32 40} +!41 = !{!"ext_intel_legacy_image", i32 41} +!42 = !{!"int64_base_atomics", i32 7} +!43 = !{!"int64_extended_atomics", i32 8} +!44 = !{!"usm_system_allocator", i32 17} +!45 = !{!"usm_restricted_shared_allocations", i32 16} +!46 = !{!"host", i32 0} +!47 = !{!"clang version 17.0.0 "} +!48 = !{i32 546} +!49 = !{i32 -1, i32 -1} +!50 = !{} +!51 = !{i1 false, i1 false} +!52 = !{!53, !53, i64 0} +!53 = !{!"vtable pointer", !54, i64 0} +!54 = !{!"Simple C++ TBAA"} +!55 = !{!56, !56, i64 0} +!56 = !{!"any pointer", !57, i64 0} +!57 = !{!"omnipotent char", !54, i64 0} +!58 = !{i32 193} +!59 = !{i32 273} diff --git a/llvm/test/tools/sycl-post-link/registerallocmode.ll b/llvm/test/tools/sycl-post-link/registerallocmode.ll index 1afdc3023a6df..751fe6de2667a 100644 --- a/llvm/test/tools/sycl-post-link/registerallocmode.ll +++ b/llvm/test/tools/sycl-post-link/registerallocmode.ll @@ -13,10 +13,9 @@ ; RUN: FileCheck %s -input-file=%t_esimd_0.sym --check-prefixes CHECK-ESIMD-LargeGRF-SYM ; CHECK: [Code|Properties|Symbols] -; CHECK: {{.*}}_0.ll|{{.*}}_0.prop|{{.*}}_0.sym ; CHECK: {{.*}}_esimd_0.ll|{{.*}}_esimd_0.prop|{{.*}}_esimd_0.sym -; CHECK: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym ; 
CHECK: {{.*}}_esimd_1.ll|{{.*}}_esimd_1.prop|{{.*}}_esimd_1.sym +; CHECK: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym ; CHECK-ESIMD-LargeGRF-PROP: isEsimdImage=1|1 ; CHECK-ESIMD-LargeGRF-PROP: sycl-register-alloc-mode=1|2 diff --git a/llvm/test/tools/sycl-post-link/sycl-esimd/invoke-esimd-double.ll b/llvm/test/tools/sycl-post-link/sycl-esimd/invoke-esimd-double.ll new file mode 100644 index 0000000000000..c4f8733a11f19 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/sycl-esimd/invoke-esimd-double.ll @@ -0,0 +1,71 @@ +; RUN: sycl-post-link -symbols -split=auto -S < %s -o %t.table +; RUN: FileCheck %s -input-file=%t.table --check-prefixes CHECK-TABLE +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS +; RUN: FileCheck %s -input-file=%t_1.ll --implicit-check-not double + +; Two module should be generated, one contains double kernel, other contains float kernel +; CHECK-TABLE: {{.*}}_0.ll|{{.*}}_0.prop|{{.*}}_0.sym +; CHECK-TABLE: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym +; CHECK-TABLE-EMPTY: + +; CHECK-M0-SYMS: simd_func_double +; CHECK-M0-SYMS-NEXT: helper_double_[[#]] +; CHECK-M0-SYMS-NEXT: double_kernel +; CHECK-M0-SYMS-EMPTY: + +; CHECK-M1-SYMS: simd_func_float +; CHECK-M1-SYMS-NEXT: helper_float_[[#]] +; CHECK-M1-SYMS-NEXT: float_kernel +; CHECK-M1-SYMS-EMPTY: + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +; ================ float kernel ================ + +declare dso_local x86_regcallcc noundef float @_Z33__regcall3____builtin_invoke_simdfloat(ptr noundef, ptr noundef, float noundef, float noundef) + +define weak_odr dso_local spir_kernel void @float_kernel() #0 { +entry: + %res = tail call x86_regcallcc noundef float @_Z33__regcall3____builtin_invoke_simdfloat(ptr noundef nonnull @helper_float, ptr noundef nonnull @simd_func_float, float noundef 1.0, float noundef 2.0) + ret void +} + +define linkonce_odr dso_local x86_regcallcc <2 x float> @helper_float(ptr noundef nonnull %f, <2 x float> %simd_args.coerce, float noundef %simd_args3) #0 { +entry: + %call = tail call x86_regcallcc <2 x float> %f(<2 x float> %simd_args.coerce, float noundef %simd_args3) + ret <2 x float> %call +} + +define linkonce_odr dso_local x86_regcallcc <2 x float> @simd_func_float(<2 x float> %x.coerce, float noundef %n) #0 !sycl_explicit_simd !0 !intel_reqd_sub_group_size !1 { +entry: + ret <2 x float> zeroinitializer +} + +; ================ double kernel ================ + +declare dso_local x86_regcallcc noundef double @_Z33__regcall3____builtin_invoke_simddouble(ptr noundef, ptr noundef, double noundef, double noundef) + +define weak_odr dso_local spir_kernel void @double_kernel() #0 !sycl_used_aspects !2 { +entry: + %res = tail call x86_regcallcc noundef double @_Z33__regcall3____builtin_invoke_simddouble(ptr noundef nonnull @helper_double, ptr noundef nonnull @simd_func_double, double noundef 1.0, double noundef 2.0) + ret void +} + +define linkonce_odr dso_local x86_regcallcc <2 x double> @helper_double(ptr noundef nonnull %f, <2 x double> %simd_args.coerce, double noundef %simd_args3) #0 !sycl_used_aspects !2 { +entry: + %call = tail call x86_regcallcc <2 x double> %f(<2 x double> %simd_args.coerce, double noundef %simd_args3) + ret <2 x double> %call +} + +define linkonce_odr dso_local x86_regcallcc <2 x double> @simd_func_double(<2 x double> %x.coerce, double noundef %n) #0 !sycl_explicit_simd !0 
!intel_reqd_sub_group_size !1 !sycl_used_aspects !2 { +entry: + ret <2 x double> zeroinitializer +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } + +!0 = !{} +!1 = !{i32 1} +!2 = !{i32 6} diff --git a/llvm/test/tools/sycl-post-link/sycl-esimd/no-sycl-esimd-split-shared-func.ll b/llvm/test/tools/sycl-post-link/sycl-esimd/no-sycl-esimd-split-shared-func.ll index 0812817d78803..aae4d6278fda6 100644 --- a/llvm/test/tools/sycl-post-link/sycl-esimd/no-sycl-esimd-split-shared-func.ll +++ b/llvm/test/tools/sycl-post-link/sycl-esimd/no-sycl-esimd-split-shared-func.ll @@ -7,20 +7,29 @@ ; RUN: sycl-post-link -lower-esimd -symbols -split=auto -S < %s -o %t.table ; RUN: FileCheck %s -input-file=%t.table --check-prefixes CHECK-TABLE -; RUN: FileCheck %s -input-file=%t_0.ll -; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-SYM +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-HELPERS-SYM +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-ESIMD-SYM +; RUN: FileCheck %s -input-file=%t_1.ll ;---------------- Verify generated table file. ; CHECK-TABLE: [Code|Properties|Symbols] +; Extra module is emitted because non-kernel functions are also treated +; as entry points (SPMD_CALLER, SIMD_CALL_HELPER_1) +; TODO: we should consider removing sycl-module-id attribute from +; SIMD_CALL_HELPER_* functions generated by the compiler, because we don't +; expect them to be referenced externally. ; CHECK-TABLE: {{.*}}_0.ll|{{.*}}_0.prop|{{.*}}_0.sym +; CHECK-TABLE: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym ; CHECK-TABLE-EMPTY: ;---------------- Verify generated symbol file. -; CHECK-SYM: SPMD_CALLER -; CHECK-SYM: SYCL_kernel -; CHECK-SYM: ESIMD_kernel -; CHECK-SYM: SIMD_CALL_HELPER_{{[0-9]+}} -; CHECK-SYM-EMPTY: +; CHECK-HELPERS-SYM: SIMD_CALL_HELPER_[[#]] +; CHECK-HELPERS-SYM: SPMD_CALLER +; CHECK-HELPERS-SYM-EMPTY: +; +; CHECK-ESIMD-SYM: ESIMD_kernel +; CHECK-ESIMD-SYM: SYCL_kernel +; CHECK-ESIMD-SYM-EMPTY: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" @@ -76,7 +85,7 @@ entry: } ;---- ESIMD kernel, an entry point -define dso_local spir_kernel void @ESIMD_kernel(float addrspace(1)* %ptr) #2 !sycl_explicit_simd !0 { +define dso_local spir_kernel void @ESIMD_kernel(float addrspace(1)* %ptr) #1 !sycl_explicit_simd !0 { entry: %ptr_as4 = addrspacecast float addrspace(1)* %ptr to float addrspace(4)* %res = call x86_regcallcc <4 x float> @SIMD_CALLEE(<4 x float> ) @@ -87,7 +96,6 @@ entry: attributes #0 = { "sycl-module-id"="invoke_simd.cpp" } attributes #1 = { "sycl-module-id"="a.cpp" } -attributes #2 = { "sycl-module-id"="b.cpp" } attributes #3 = { "referenced-indirectly" } !0 = !{} @@ -95,17 +103,7 @@ attributes #3 = { "referenced-indirectly" } !2 = !{i32 4} ;---------------- Verify IR. Outlined to avoid complications with reordering. -; Check the original version (for SYCL call graph) is retained -; CHECK: define dso_local spir_func <4 x float> @SHARED_F( - -; Verify __builtin_invoke_simd lowering -; 1) the second argument (function pointer) is removed -; 2) The call target (helper) is changed to the optimized one -; CHECK: define dso_local spir_func float @SPMD_CALLER(float %{{.*}}) -; CHECK: %{{.*}} = call spir_func float @_Z33__regcall3____builtin_invoke_simdXX_{{.+}}(<4 x float> (<4 x float>)* @[[NEW_HELPER_NAME:SIMD_CALL_HELPER_[0-9]+]], float %{{.*}}) -; CHECK: ret float %{{.*}} - -; Check the function is cloned for ESIMD call graph. 
+; Check the function is modified in ESIMD module. ; CHECK: define dso_local spir_func <4 x float> @SHARED_F.esimd(i64 %{{.*}}) #[[SHARED_F_ATTRS:[0-9]+]] { ; CHECK: %{{.*}} = call spir_func <4 x float> @__intrin(i64 %{{.*}}) ; CHECK: ret <4 x float> %{{.*}} @@ -116,11 +114,21 @@ attributes #3 = { "referenced-indirectly" } ;---- * linkonce_odr linkage replaced with weak_odr ;---- * sycl_explicit_simd and intel_reqd_sub_group_size attributes added, which ;---- is required for correct processing by LowerESIMD -; CHECK: define weak_odr dso_local x86_regcallcc <4 x float> @[[NEW_HELPER_NAME]](<4 x float> %{{.*}}) #[[NEW_HELPER_ATTRS:[0-9]+]] !sycl_explicit_simd !1 !intel_reqd_sub_group_size !2 { +; CHECK: define weak_odr dso_local x86_regcallcc <4 x float> @[[NEW_HELPER_NAME:SIMD_CALL_HELPER_[0-9]+]](<4 x float> %{{.*}}) #[[NEW_HELPER_ATTRS:[0-9]+]] !sycl_explicit_simd ![[#]] !intel_reqd_sub_group_size ![[#]] { ; CHECK-NEXT: %{{.*}} = call spir_func <4 x float> @SHARED_F.esimd(i64 100) ; CHECK-NEXT: %{{.*}} = fadd <4 x float> %{{.*}}, %{{.*}} ; CHECK-NEXT: ret <4 x float> {{.*}} +; Check the original version (for SYCL call graph) is retained +; CHECK: define dso_local spir_func <4 x float> @SHARED_F( + +; Verify __builtin_invoke_simd lowering +; 1) the second argument (function pointer) is removed +; 2) The call target (helper) is changed to the optimized one +; CHECK: define dso_local spir_func float @SPMD_CALLER(float %{{.*}}) +; CHECK: %{{.*}} = call spir_func float @_Z33__regcall3____builtin_invoke_simdXX_{{.+}}(<4 x float> (<4 x float>)* @[[NEW_HELPER_NAME]], float %{{.*}}) +; CHECK: ret float %{{.*}} + ; Check that VCStackCall attribute is added to the invoke_simd helpers functions: ; CHECK: attributes #[[SHARED_F_ATTRS]] = { noinline "VCFunction" } ; CHECK: attributes #[[NEW_HELPER_ATTRS]] = { "VCFunction" "VCStackCall" "sycl-module-id"="invoke_simd.cpp" } diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/tools/sycl-post-link/ModuleSplitter.cpp index 49aa88b6290db..02446737900f4 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp +++ b/llvm/tools/sycl-post-link/ModuleSplitter.cpp @@ -12,6 +12,7 @@ #include "Support.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" @@ -36,40 +37,12 @@ using namespace llvm; using namespace llvm::module_split; namespace { - // Identifying name for global scope constexpr char GLOBAL_SCOPE_NAME[] = ""; constexpr char SYCL_SCOPE_NAME[] = ""; constexpr char ESIMD_SCOPE_NAME[] = ""; constexpr char ESIMD_MARKER_MD[] = "sycl_explicit_simd"; -bool hasIndirectFunctionsOrCalls(const Module &M) { - for (const auto &F : M.functions()) { - // There are functions marked with [[intel::device_indirectly_callable]] - // attribute, because it instructs us to make this function available to the - // whole program as it was compiled as a single module. - if (F.hasFnAttribute("referenced-indirectly")) - return true; - if (F.isDeclaration()) - continue; - // There are indirect calls in the module, which means that we don't know - // how to group functions so both caller and callee of indirect call are in - // the same module. - for (const auto &I : instructions(F)) { - if (auto *CI = dyn_cast(&I)) - if (!CI->getCalledFunction()) - return true; - } - - // Function pointer is used somewhere. Follow the same rule as above. 
- for (const auto *U : F.users()) - if (!isa(U)) - return true; - } - - return false; -} - EntryPointsGroupScope selectDeviceCodeGroupScope(const Module &M, IRSplitMode Mode, bool AutoSplitIsGlobalScope) { @@ -81,7 +54,7 @@ EntryPointsGroupScope selectDeviceCodeGroupScope(const Module &M, return Scope_PerKernel; case SPLIT_AUTO: { - if (hasIndirectFunctionsOrCalls(M) || AutoSplitIsGlobalScope) + if (AutoSplitIsGlobalScope) return Scope_Global; // At the moment, we assume that per-source split is the best way of @@ -169,7 +142,6 @@ groupEntryPointsByKernelType(ModuleDesc &MD, if (!isEntryPoint(F, EmitOnlyKernelsAsEntryPoints) || !MD.isEntryPointCandidate(F)) continue; - if (isESIMDFunction(F)) EntryPointMap[ESIMD_SCOPE_NAME].insert(&F); else @@ -199,93 +171,150 @@ groupEntryPointsByKernelType(ModuleDesc &MD, return EntryPointGroups; } -// Represents a call graph between functions in a module. Nodes are functions, -// edges are "calls" relation. -class CallGraph { +// Represents "dependency" or "use" graph of global objects (functions and +// global variables) in a module. It is used during device code split to +// understand which global variables and functions (other than entry points) +// should be included into a split module. +// +// Nodes of the graph represent LLVM's GlobalObjects, edges "A" -> "B" represent +// the fact that if "A" is included into a module, then "B" should be included +// as well. +// +// Examples of dependencies which are represented in this graph: +// - Function FA calls function FB +// - Function FA uses global variable GA +// - Global variable GA references (initialized with) function FB +// - Function FA stores address of a function FB somewhere +// +// The following cases are treated as dependencies between global objects: +// 1. Global object A is used within by a global object B in any way (store, +// bitcast, phi node, call, etc.): "A" -> "B" edge will be added to the +// graph; +// 2. function A performs an indirect call of a function with signature S and +// there is a function B with signature S. 
"A" -> "B" edge will be added to +// the graph; +class DependencyGraph { public: - using FunctionSet = SmallPtrSet; - -private: - std::unordered_map Graph; - SmallPtrSet EmptySet; - FunctionSet AddrTakenFunctions; + using GlobalSet = SmallPtrSet; + + DependencyGraph(const Module &M) { + // Group functions by their signature to handle case (2) described above + DenseMap + FuncTypeToFuncsMap; + for (const auto &F : M.functions()) { + // Kernels can't be called (either directly or indirectly) in SYCL + if (!isKernel(F)) + FuncTypeToFuncsMap[F.getFunctionType()].insert(&F); + } -public: - CallGraph(const Module &M) { - for (const auto &F : M) { - for (const Value *U : F.users()) { - if (const auto *I = dyn_cast(U)) { - if (I->getCalledFunction() == &F) { - const Function *F1 = I->getFunction(); - Graph[F1].insert(&F); - } - } - } - if (F.hasAddressTaken()) { - AddrTakenFunctions.insert(&F); + // We add every function into the graph + for (const auto &F : M.functions()) { + // case (1), see comment above the class definition + for (const Value *U : F.users()) + addUserToGraphRecursively(cast(U), &F); + + // case (2), see comment above the class definition + for (const auto &I : instructions(F)) { + const auto *CI = dyn_cast(&I); + if (!CI || !CI->isIndirectCall()) // Direct calls were handled above + continue; + + // TODO: consider limiting set of potential callees to functions marked + // with special attribute (like [[intel::device_indirectly_callable]]) + const FunctionType *Signature = CI->getFunctionType(); + // Note: strictly speaking, virtual functions are allowed to use + // co-variant return types, i.e. we can actually miss a potential callee + // here, because it has different signature (different return type). + // However, this is not a problem for two reasons: + // - opaque pointers will be enabled at some point and will make + // signatures the same in that case + // - all virtual functions are referenced from vtable and therefore will + // anyway be preserved in a module + const auto &PotentialCallees = FuncTypeToFuncsMap[Signature]; + Graph[&F].insert(PotentialCallees.begin(), PotentialCallees.end()); } } + + // And every global variable (but their handling is a bit simpler) + for (const auto &GV : M.globals()) + for (const Value *U : GV.users()) + addUserToGraphRecursively(cast(U), &GV); } - iterator_range - successors(const Function *F) const { - auto It = Graph.find(F); + iterator_range + dependencies(const GlobalValue *Val) const { + auto It = Graph.find(Val); return (It == Graph.end()) ? make_range(EmptySet.begin(), EmptySet.end()) : make_range(It->second.begin(), It->second.end()); } - iterator_range addrTakenFunctions() const { - return make_range(AddrTakenFunctions.begin(), AddrTakenFunctions.end()); +private: + void addUserToGraphRecursively(const User *Root, const GlobalValue *V) { + + SmallVector WorkList; + WorkList.push_back(Root); + + while (!WorkList.empty()) { + const User *U = WorkList.pop_back_val(); + if (const auto *I = dyn_cast(U)) { + const auto *UFunc = I->getFunction(); + Graph[UFunc].insert(V); + } else if (isa(U)) { + if (const auto *GV = dyn_cast(U)) + Graph[GV].insert(V); + // This could be a global variable or some constant expression (like + // bitcast or gep). We trace users of this constant further to reach + // global objects they are used by and add them to the graph. 
+ for (const auto *UU : U->users()) + WorkList.push_back(UU); + } else { + llvm_unreachable("Unhandled type of function user"); + } + } } + + DenseMap Graph; + SmallPtrSet EmptySet; }; -void collectFunctionsToExtract(SetVector &GVs, - const EntryPointGroup &ModuleEntryPoints, - const CallGraph &Deps) { +void collectFunctionsAndGlobalVariablesToExtract( + SetVector &GVs, const Module &M, + const EntryPointGroup &ModuleEntryPoints, const DependencyGraph &Deps, + const std::function &IncludeFunctionPredicate = + nullptr) { + // We start with module entry points for (const auto *F : ModuleEntryPoints.Functions) GVs.insert(F); - // It is conservatively assumed that any address-taken function can be invoked - // or otherwise used by any function in any module split from the initial one. - // So such functions along with the call graphs they start are always - // extracted (and duplicated in each split module). They are not treated as - // entry points, as SYCL runtime requires that intersection of entry point - // sets of different device binaries (for the same target) must be empty. - // TODO: try to determine which split modules really use address-taken - // functions and only duplicate the functions in such modules. Note that usage - // may include e.g. function address comparison w/o actual invocation. - for (const auto *F : Deps.addrTakenFunctions()) { - if (!isKernel(*F) && (isESIMDFunction(*F) == ModuleEntryPoints.isEsimd())) - GVs.insert(F); + + // Non-discardable global variables are also include into the initial set + for (const auto &GV : M.globals()) { + if (!GV.isDiscardableIfUnused()) + GVs.insert(&GV); } // GVs has SetVector type. This type inserts a value only if it is not yet // present there. So, recursion is not expected here. decltype(GVs.size()) Idx = 0; while (Idx < GVs.size()) { - const auto *F = cast(GVs[Idx++]); + const auto *Obj = GVs[Idx++]; + + for (const GlobalValue *Dep : Deps.dependencies(Obj)) { + if (const auto *Func = dyn_cast(Dep)) { + if (Func->isDeclaration()) + continue; - for (const Function *F1 : Deps.successors(F)) { - if (!F1->isDeclaration()) - GVs.insert(F1); + // Functions can be additionally filtered + if (!IncludeFunctionPredicate || IncludeFunctionPredicate(Func)) + GVs.insert(Func); + } else { + // Global variables are added unconditionally + GVs.insert(Dep); + } } } } -void collectGlobalVarsToExtract(SetVector &GVs, - const Module &M) { - // It's not easy to trace global variable's uses inside needed functions - // because global variable can be used inside a combination of operators, so - // mark all global variables as needed and remove dead ones after cloning. - // Notice. For device global variables with the 'device_image_scope' property, - // removing dead ones is a must, the 'checkImageScopedDeviceGlobals' function - // checks that there are no usages of a single device global variable with the - // 'device_image_scope' property from multiple modules and the splitter must - // not add such usages after the check. - for (const auto &G : M.globals()) - GVs.insert(&G); -} - ModuleDesc extractSubModule(const ModuleDesc &MD, const SetVector GVs, EntryPointGroup &&ModuleEntryPoints) { @@ -310,10 +339,34 @@ ModuleDesc extractSubModule(const ModuleDesc &MD, // points that are specified in ModuleEntryPoints vector. 
ModuleDesc extractCallGraph(const ModuleDesc &MD, EntryPointGroup &&ModuleEntryPoints, - const CallGraph &CG) { + const DependencyGraph &CG, + const std::function + &IncludeFunctionPredicate = nullptr) { + SetVector GVs; + collectFunctionsAndGlobalVariablesToExtract( + GVs, MD.getModule(), ModuleEntryPoints, CG, IncludeFunctionPredicate); + + ModuleDesc SplitM = extractSubModule(MD, GVs, std::move(ModuleEntryPoints)); + SplitM.cleanup(); + + return SplitM; +} + +// The function is similar to 'extractCallGraph', but it produces a copy of the +// input LLVM IR module M with _all_ ESIMD functions and kernels included, +// regardless of whether or not they are listed in ModuleEntryPoints. +ModuleDesc extractESIMDSubModule(const ModuleDesc &MD, + EntryPointGroup &&ModuleEntryPoints, + const DependencyGraph &CG, + const std::function + &IncludeFunctionPredicate = nullptr) { SetVector GVs; - collectFunctionsToExtract(GVs, ModuleEntryPoints, CG); - collectGlobalVarsToExtract(GVs, MD.getModule()); + for (const auto &F : MD.getModule().functions()) + if (isESIMDFunction(F)) + GVs.insert(&F); + + collectFunctionsAndGlobalVariablesToExtract( + GVs, MD.getModule(), ModuleEntryPoints, CG, IncludeFunctionPredicate); ModuleDesc SplitM = extractSubModule(MD, GVs, std::move(ModuleEntryPoints)); SplitM.cleanup(); @@ -341,26 +394,12 @@ class ModuleSplitter : public ModuleSplitterBase { } private: - CallGraph CG; + DependencyGraph CG; }; - } // namespace - namespace llvm { namespace module_split { -std::unique_ptr -getSplitterByKernelType(ModuleDesc &&MD, bool EmitOnlyKernelsAsEntryPoints) { - EntryPointGroupVec Groups = - groupEntryPointsByKernelType(MD, EmitOnlyKernelsAsEntryPoints); - bool DoSplit = (Groups.size() > 1); - - if (DoSplit) - return std::make_unique(std::move(MD), std::move(Groups)); - else - return std::make_unique(std::move(MD), std::move(Groups)); -} - void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() { const Module &M = getInputModule(); // Early exit if there is only one group @@ -891,5 +930,61 @@ getDeviceCodeSplitter(ModuleDesc &&MD, IRSplitMode Mode, bool IROutputOnly, return std::make_unique(std::move(MD), std::move(Groups)); } +// Splits the input module into two: +// - one containing _all_ ESIMD kernels, ESIMD functions and everything they use +// - another one which contains everything else +// +// The most interesting part here is that if a regular SYCL kernel uses an ESIMD +// function (through invoke_simd), it won't be included in the non-ESIMD module. +// +// The reason for that is that ESIMD functions should undergo special +// handling and therefore we isolate them all into a separate module completely +// to do so. Due to design choices in passes provided by the vc-intrinsics repo, we +// can't handle ESIMD functions _only_ in a mixed module. +// +// Functions which are used from both ESIMD and non-ESIMD code will be +// duplicated into each module. +// +// If there are dependencies between ESIMD and non-ESIMD code (produced by +// invoke_simd, for example), the modules have to be linked back together to avoid +// undefined behavior at later stages. That is done at a higher level, outside of +// this function.
+SmallVector splitByESIMD(ModuleDesc &&MD, + bool EmitOnlyKernelsAsEntryPoints) { + + SmallVector Result; + + EntryPointGroupVec EntryPointGroups = + groupEntryPointsByKernelType(MD, EmitOnlyKernelsAsEntryPoints); + + if (EntryPointGroups.size() == 1) { + Result.emplace_back(std::move(MD.releaseModulePtr()), + std::move(EntryPointGroups[0]), MD.Props); + return Result; + } + + DependencyGraph CG(MD.getModule()); + for (auto &Group : EntryPointGroups) { + if (Group.isEsimd()) { + // For ESIMD module, we use full call graph of all entry points and all + // ESIMD functions. + Result.emplace_back( + std::move(extractESIMDSubModule(MD, std::move(Group), CG))); + } else { + // For non-ESIMD module we only use non-ESIMD functions. Additional filter + // is needed, because there could be uses of ESIMD functions from + // non-ESIMD functions through invoke_simd. If that is the case, both + // modules are expected to be linked back together after ESIMD functions + // were processed and therefore it is fine to return an "incomplete" + // module here. + Result.emplace_back(std::move(extractCallGraph( + MD, std::move(Group), CG, + [=](const Function *F) -> bool { return !isESIMDFunction(*F); }))); + } + } + + return Result; +} + } // namespace module_split } // namespace llvm diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.h b/llvm/tools/sycl-post-link/ModuleSplitter.h index 3652f756ce1e1..6a4591820b290 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.h +++ b/llvm/tools/sycl-post-link/ModuleSplitter.h @@ -238,13 +238,13 @@ class ModuleSplitterBase { bool hasMoreSplits() const { return remainingSplits() > 0; } }; +SmallVector splitByESIMD(ModuleDesc &&MD, + bool EmitOnlyKernelsAsEntryPoints); + std::unique_ptr getDeviceCodeSplitter(ModuleDesc &&MD, IRSplitMode Mode, bool IROutputOnly, bool EmitOnlyKernelsAsEntryPoints); -std::unique_ptr -getSplitterByKernelType(ModuleDesc &&MD, bool EmitOnlyKernelsAsEntryPoints); - #ifndef NDEBUG void dumpEntryPoints(const EntryPointSet &C, const char *msg = "", int Tab = 0); void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false, diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index c07bffa8a3207..d142b87c71862 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -751,6 +751,63 @@ static bool removeSYCLKernelsConstRefArray(Module &M) { return true; } +SmallVector +handleESIMD(module_split::ModuleDesc &&MDesc, bool &Modified, + bool &SplitOccurred) { + // Do SYCL/ESIMD splitting. It happens always, as ESIMD and SYCL must + // undergo different set of LLVMIR passes. After this they are linked back + // together to form single module with disjoint SYCL and ESIMD call graphs + // unless -split-esimd option is specified. The graphs become disjoint + // when linked back because functions shared between graphs are cloned and + // renamed. + SmallVector Result = module_split::splitByESIMD( + std::move(MDesc), EmitOnlyKernelsAsEntryPoints); + + if (Result.size() > 1 && SplitOccurred && + (SplitMode == module_split::SPLIT_PER_KERNEL) && !SplitEsimd) { + // Controversial state reached - SYCL and ESIMD entry points resulting + // from SYCL/ESIMD split (which is done always) are linked back, since + // -split-esimd is not specified, but per-kernel split is requested. 
+ warning("SYCL and ESIMD entry points detected and split mode is " + "per-kernel, so " + + SplitEsimd.ValueStr + " must also be specified"); + } + SplitOccurred |= Result.size() > 1; + + for (auto &MD : Result) { + DUMP_ENTRY_POINTS(MD.entries(), MD.Name.c_str(), 3); + Modified |= processSpecConstants(MD); + if (LowerEsimd && MD.isESIMD()) + Modified |= lowerEsimdConstructs(MD); + } + + if (!SplitEsimd && Result.size() > 1) { + // SYCL/ESIMD splitting is not requested, link back into single module. + assert(Result.size() == 2 && + "Unexpected number of modules as results of ESIMD split"); + int ESIMDInd = Result[0].isESIMD() ? 0 : 1; + int SYCLInd = 1 - ESIMDInd; + assert(Result[SYCLInd].isSYCL() && + "no non-ESIMD module as a result ESIMD split?"); + + // ... but before that, make sure no link conflicts will occur. + Result[ESIMDInd].renameDuplicatesOf(Result[SYCLInd].getModule(), ".esimd"); + module_split::ModuleDesc Linked = + link(std::move(Result[0]), std::move(Result[1])); + Linked.restoreLinkageOfDirectInvokeSimdTargets(); + string_vector Names; + Linked.saveEntryPointNames(Names); + Linked.cleanup(); // may remove some entry points, need to save/rebuild + Linked.rebuildEntryPoints(Names); + Result.clear(); + Result.emplace_back(std::move(Linked)); + DUMP_ENTRY_POINTS(Result.back().entries(), Result.back().Name.c_str(), 3); + Modified = true; + } + + return Result; +} + std::unique_ptr processInputModule(std::unique_ptr M) { // Construct the resulting table which will accumulate all the outputs. @@ -825,62 +882,8 @@ processInputModule(std::unique_ptr M) { DUMP_ENTRY_POINTS(MDesc.entries(), MDesc.Name.c_str(), 1); MDesc.fixupLinkageOfDirectInvokeSimdTargets(); - - // Do SYCL/ESIMD splitting. It happens always, as ESIMD and SYCL must - // undergo different set of LLVMIR passes. After this they are linked back - // together to form single module with disjoint SYCL and ESIMD call graphs - // unless -split-esimd option is specified. The graphs become disjoint - // when linked back because functions shared between graphs are cloned and - // renamed. - std::unique_ptr ESIMDSplitter = - module_split::getSplitterByKernelType(std::move(MDesc), - EmitOnlyKernelsAsEntryPoints); - bool ESIMDSplitOccurred = ESIMDSplitter->remainingSplits() > 1; - - if (ESIMDSplitOccurred && SplitOccurred && - (SplitMode == module_split::SPLIT_PER_KERNEL) && !SplitEsimd) { - // Controversial state reached - SYCL and ESIMD entry points resulting - // from SYCL/ESIMD split (which is done always) are linked back, since - // -split-esimd is not specified, but per-kernel split is requested. - warning("SYCL and ESIMD entry points detected and split mode is " - "per-kernel, so " + - SplitEsimd.ValueStr + " must also be specified"); - } - SmallVector MMs; - SplitOccurred |= ESIMDSplitOccurred; - Modified |= SplitOccurred; - - while (ESIMDSplitter->hasMoreSplits()) { - module_split::ModuleDesc MDesc2 = ESIMDSplitter->nextSplit(); - DUMP_ENTRY_POINTS(MDesc2.entries(), MDesc2.Name.c_str(), 3); - Modified |= processSpecConstants(MDesc2); - - if (!MDesc2.isSYCL() && LowerEsimd) { - assert(MDesc2.isESIMD() && "NYI"); - Modified |= lowerEsimdConstructs(MDesc2); - } - MMs.emplace_back(std::move(MDesc2)); - } - if (!SplitEsimd && (MMs.size() > 1)) { - // SYCL/ESIMD splitting is not requested, link back into single module. - assert(MMs.size() == 2); - assert((MMs[0].isESIMD() && MMs[1].isSYCL()) || - (MMs[1].isESIMD() && MMs[0].isSYCL())); - int ESIMDInd = MMs[0].isESIMD() ? 0 : 1; - int SYCLInd = MMs[0].isESIMD() ? 
1 : 0; - // ... but before that, make sure no link conflicts will occur. - MMs[ESIMDInd].renameDuplicatesOf(MMs[SYCLInd].getModule(), ".esimd"); - module_split::ModuleDesc M2 = link(std::move(MMs[0]), std::move(MMs[1])); - M2.restoreLinkageOfDirectInvokeSimdTargets(); - string_vector Names; - M2.saveEntryPointNames(Names); - M2.cleanup(); // may remove some entry points, need to save/rebuild - M2.rebuildEntryPoints(Names); - MMs.clear(); - MMs.emplace_back(std::move(M2)); - DUMP_ENTRY_POINTS(MMs.back().entries(), MMs.back().Name.c_str(), 3); - Modified = true; - } + SmallVector MMs = + handleESIMD(std::move(MDesc), Modified, SplitOccurred); if (IROutputOnly) { if (SplitOccurred) { From dc48d7ce4741c1ed7b4002dda07dc7050ddcba72 Mon Sep 17 00:00:00 2001 From: 0x12CC <68250218+0x12CC@users.noreply.github.com> Date: Tue, 13 Jun 2023 02:30:41 -0400 Subject: [PATCH 03/55] [SYCL] Remove invalid `copy` test case (#9829) KhronosGroup/SYCL-Docs#425 clarifies that the source type for the `copy` must be device copyable. Remove the copy from `const void *` test case since it fails this assertion. Signed-off-by: Michael Aziz --- sycl/test-e2e/Basic/handler/handler_mem_op.cpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/sycl/test-e2e/Basic/handler/handler_mem_op.cpp b/sycl/test-e2e/Basic/handler/handler_mem_op.cpp index 5d24f0d420dae..c3c8fd625e760 100644 --- a/sycl/test-e2e/Basic/handler/handler_mem_op.cpp +++ b/sycl/test-e2e/Basic/handler/handler_mem_op.cpp @@ -255,20 +255,6 @@ template void test_copy_ptr_acc() { assert(Data[I] == Values[I]); } - // Check copy from 'const void *' memory to accessor. - { - buffer Buffer(Size); - queue Queue; - Queue.submit([&](handler &Cgh) { - auto Acc = Buffer.template get_access(Cgh); - Cgh.copy(reinterpret_cast(Values), Acc); - }); - - auto Acc = Buffer.template get_access(); - for (int I = 0; I < Size; ++I) - assert(Acc[I] == Values[I]); - } - // Check copy from memory to 0-dimensional accessor. T SrcValue = 99; T DstValue = 0; From b7f09d83da471c0a11fe65a7d70216f2993d7f92 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Tue, 13 Jun 2023 11:12:20 +0100 Subject: [PATCH 04/55] [SYCL][CUDA] Non-uniform algorithm implementations for ext_oneapi_cuda. (#9671) This PR adds cuda support for fixed_size_group, ballot_group, and opportunistic_group algorithms. All group algorithm support added for the SPIRV impls (those added in e.g. https://github.com/intel/llvm/pull/9181) is correspondingly added here for the cuda backend. Everything except the reduce/scans uses the same impl for all non-uniform groups. Reduce algorithms also use the same impl for all group types on sm80 for special IsRedux types/ops pairs. Otherwise reduce/scans have two impl categories: 1.fixed_size_group 2.opportunistic_group, ballot_group, (and tangle_group once it is supported) all use the same impls. Note that tangle_group is still not supported. However all algorithms implemented by ballot_group/opportunistic_group will I think be appropriate for tangle_group when it is supported. 
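For illustration, a minimal usage sketch of the kind of code these implementations back on the CUDA device. The queue/buffer scaffolding and the exact header path are assumptions for the example only; the group and algorithm names come from the fixed_size_group / non-uniform groups extensions exercised by the tests in this patch:

```cpp
// Hypothetical example, not part of this patch: reduce over a fixed_size_group
// partition of a sub-group. On CUDA this now lowers to masked shfl.sync, or to
// redux.sync on sm_80+ for the integer plus/min/max/and/or/xor cases.
#include <sycl/sycl.hpp>
#include <sycl/ext/oneapi/experimental/fixed_size_group.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue Q;
  constexpr size_t N = 128;
  sycl::buffer<int> Out{sycl::range{N}};

  Q.submit([&](sycl::handler &CGH) {
     sycl::accessor Acc{Out, CGH, sycl::write_only};
     CGH.parallel_for(sycl::nd_range<1>{N, 32}, [=](sycl::nd_item<1> It) {
       auto SG = It.get_sub_group();
       // Partition the sub-group into fixed chunks of 8 work-items.
       auto Part = syclex::get_fixed_size_group<8>(SG);
       // Masked reduction over the 8-wide partition only.
       int Sum = sycl::reduce_over_group(
           Part, static_cast<int>(It.get_global_linear_id()),
           sycl::plus<int>());
       Acc[It.get_global_id(0)] = Sum;
     });
   }).wait();
}
```

The same pattern applies to the ballot_group and opportunistic_group overloads added here; only the way the member mask is formed differs.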
--------- Signed-off-by: JackAKirk --- sycl/include/sycl/detail/spirv.hpp | 10 +- sycl/include/sycl/detail/type_traits.hpp | 7 + .../cuda/non_uniform_algorithms.hpp | 337 ++++++++++++++++++ .../oneapi/experimental/fixed_size_group.hpp | 7 + sycl/include/sycl/group_algorithm.hpp | 51 +++ .../ballot_group_algorithms.cpp | 2 +- .../fixed_size_group_algorithms.cpp | 2 +- .../opportunistic_group_algorithms.cpp | 2 +- 8 files changed, 410 insertions(+), 8 deletions(-) create mode 100644 sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp diff --git a/sycl/include/sycl/detail/spirv.hpp b/sycl/include/sycl/detail/spirv.hpp index 63172d026ec9b..a5e0c59b917d9 100644 --- a/sycl/include/sycl/detail/spirv.hpp +++ b/sycl/include/sycl/detail/spirv.hpp @@ -152,7 +152,7 @@ template bool GroupAll(ext::oneapi::experimental::tangle_group, bool pred) { return __spirv_GroupNonUniformAll(group_scope::value, pred); } -template + bool GroupAll(const ext::oneapi::experimental::opportunistic_group &, bool pred) { return __spirv_GroupNonUniformAll( @@ -1022,8 +1022,10 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { template typename std::enable_if_t< ext::oneapi::experimental::is_user_constructed_group_v> -ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { -#if defined(__SPIR__) +ControlBarrier(Group g, memory_scope FenceScope, memory_order Order) { +#if defined(__NVPTX__) + __nvvm_bar_warp_sync(detail::ExtractMask(detail::GetMask(g))[0]); +#else // SPIR-V does not define an instruction to synchronize partial groups. // However, most (possibly all?) of the current SPIR-V targets execute // work-items in lockstep, so we can probably get away with a MemoryBarrier. @@ -1033,8 +1035,6 @@ ControlBarrier(Group, memory_scope FenceScope, memory_order Order) { __spv::MemorySemanticsMask::SubgroupMemory | __spv::MemorySemanticsMask::WorkgroupMemory | __spv::MemorySemanticsMask::CrossWorkgroupMemory); -#elif defined(__NVPTX__) - // TODO: Call syncwarp with appropriate mask extracted from the group #endif } diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp index b6613ea080c03..be072531a7a14 100644 --- a/sycl/include/sycl/detail/type_traits.hpp +++ b/sycl/include/sycl/detail/type_traits.hpp @@ -20,6 +20,13 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { +template struct is_fixed_size_group : std::false_type {}; + +template +inline constexpr bool is_fixed_size_group_v = is_fixed_size_group::value; +} // namespace detail + template class group; namespace ext::oneapi { struct sub_group; diff --git a/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp new file mode 100644 index 0000000000000..eea68d89a35fe --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/cuda/non_uniform_algorithms.hpp @@ -0,0 +1,337 @@ +//==----- non_uniform_algorithms.hpp - cuda masked subgroup algorithms -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + +template +using IsRedux = std::bool_constant< + std::is_integral::value && IsBitAND::value || + IsBitOR::value || IsBitXOR::value || + IsPlus::value || IsMinimum::value || + IsMaximum::value>; + +//// Masked reductions using redux.sync, requires integer types + +template +std::enable_if_t< + is_sugeninteger::value && IsMinimum::value, T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_umin(x, MemberMask); +} + +template +std::enable_if_t< + is_sigeninteger::value && IsMinimum::value, T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_min(x, MemberMask); +} + +template +std::enable_if_t< + is_sugeninteger::value && IsMaximum::value, T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_umax(x, MemberMask); +} + +template +std::enable_if_t< + is_sigeninteger::value && IsMaximum::value, T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_max(x, MemberMask); +} + +template +std::enable_if_t<(is_sugeninteger::value || is_sigeninteger::value) && + IsPlus::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_add(x, MemberMask); +} + +template +std::enable_if_t<(is_sugeninteger::value || is_sigeninteger::value) && + IsBitAND::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_and(x, MemberMask); +} + +template +std::enable_if_t<(is_sugeninteger::value || is_sigeninteger::value) && + IsBitOR::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_or(x, MemberMask); +} + +template +std::enable_if_t<(is_sugeninteger::value || is_sigeninteger::value) && + IsBitXOR::value, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return __nvvm_redux_sync_xor(x, MemberMask); +} +//// + +//// Shuffle based masked reduction impls + +// fixed_size_group group reduction using shfls +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + for (int i = g.get_local_range()[0] / 2; i > 0; i /= 2) { + T tmp; + if constexpr (std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(x)); + auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i); + auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i); + asm volatile("mov.b64 %0,{%1,%2};" : "=d"(tmp) : "r"(tmp_a), "r"(tmp_b)); + } else if constexpr (std::is_same_v || + std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(x)); + auto tmp_a = __nvvm_shfl_sync_bfly_i32(MemberMask, x_a, -1, i); + auto tmp_b = __nvvm_shfl_sync_bfly_i32(MemberMask, x_b, -1, i); + asm volatile("mov.b64 %0,{%1,%2};" : "=l"(tmp) : "r"(tmp_a), "r"(tmp_b)); + } else if 
constexpr (std::is_same_v) { + short tmp_b16; + asm volatile("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(x)); + auto tmp_b32 = __nvvm_shfl_sync_bfly_i32( + MemberMask, static_cast(tmp_b16), -1, i); + asm volatile("mov.b16 %0,%1;" + : "=h"(tmp) + : "h"(static_cast(tmp_b32))); + } else if constexpr (std::is_same_v) { + auto tmp_b32 = + __nvvm_shfl_sync_bfly_i32(MemberMask, __nvvm_bitcast_f2i(x), -1, i); + tmp = __nvvm_bitcast_i2f(tmp_b32); + } else { + tmp = __nvvm_shfl_sync_bfly_i32(MemberMask, x, -1, i); + } + x = binary_op(x, tmp); + } + return x; +} + +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v, T> +non_uniform_shfl_T(const uint32_t MemberMask, T x, int shfl_param) { + if constexpr (is_fixed_size_group_v) { + return __nvvm_shfl_sync_up_i32(MemberMask, x, shfl_param, 0); + } else { + return __nvvm_shfl_sync_idx_i32(MemberMask, x, shfl_param, 31); + } +} + +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v, T> +non_uniform_shfl(Group g, const uint32_t MemberMask, T x, int shfl_param) { + T res; + if constexpr (std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(x)); + auto tmp_a = non_uniform_shfl_T(MemberMask, x_a, shfl_param); + auto tmp_b = non_uniform_shfl_T(MemberMask, x_b, shfl_param); + asm volatile("mov.b64 %0,{%1,%2};" : "=d"(res) : "r"(tmp_a), "r"(tmp_b)); + } else if constexpr (std::is_same_v || + std::is_same_v) { + int x_a, x_b; + asm volatile("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(x)); + auto tmp_a = non_uniform_shfl_T(MemberMask, x_a, shfl_param); + auto tmp_b = non_uniform_shfl_T(MemberMask, x_b, shfl_param); + asm volatile("mov.b64 %0,{%1,%2};" : "=l"(res) : "r"(tmp_a), "r"(tmp_b)); + } else if constexpr (std::is_same_v) { + short tmp_b16; + asm volatile("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(x)); + auto tmp_b32 = non_uniform_shfl_T( + MemberMask, static_cast(tmp_b16), shfl_param); + asm volatile("mov.b16 %0,%1;" + : "=h"(res) + : "h"(static_cast(tmp_b32))); + } else if constexpr (std::is_same_v) { + auto tmp_b32 = non_uniform_shfl_T(MemberMask, __nvvm_bitcast_f2i(x), + shfl_param); + res = __nvvm_bitcast_i2f(tmp_b32); + } else { + res = non_uniform_shfl_T(MemberMask, x, shfl_param); + } + return res; +} + +// Opportunistic/Ballot group reduction using shfls +template +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v && + !is_fixed_size_group_v, + T> +masked_reduction_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + + unsigned localSetBit = g.get_local_id()[0] + 1; + + // number of elements requiring binary operations each loop iteration + auto opRange = g.get_local_range()[0]; + + // stride between local_ids forming a binary op + unsigned stride = opRange / 2; + while (stride >= 1) { + + // if (remainder == 1), there is a WI without a binary op partner + unsigned remainder = opRange % 2; + + // unfolded position of set bit in mask of shfl src lane + int unfoldedSrcSetBit = localSetBit + stride; + + // __nvvm_fns automatically wraps around to the correct bit position. 
+ // There is no performance impact on src_set_bit position wrt localSetBit + auto tmp = non_uniform_shfl(g, MemberMask, x, + __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit)); + + if (!(localSetBit == 1 && remainder != 0)) { + x = binary_op(x, tmp); + } + + opRange = stride + remainder; + stride = opRange / 2; + } + unsigned broadID; + asm volatile(".reg .u32 rev;\n\t" + "brev.b32 rev, %1;\n\t" // reverse mask bits + "clz.b32 %0, rev;" + : "=r"(broadID) + : "r"(MemberMask)); + + return non_uniform_shfl(g, MemberMask, x, broadID); +} + +// Non Redux types must fall back to shfl based implementations. +template +std::enable_if_t< + std::is_same, std::false_type>::value && + ext::oneapi::experimental::is_user_constructed_group_v, + T> +masked_reduction_cuda_sm80(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + return masked_reduction_cuda_shfls(g, x, binary_op, MemberMask); +} + +// get_identity is only currently used in this cuda specific header. If in the +// future it has more general use it should be moved to a more appropriate +// header. +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value || + IsBitOR::value || + IsBitXOR::value, + T> + get_identity() { + return 0; +} + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, T> + get_identity() { + return 1; +} + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t::value, T> + get_identity() { + return ~0; +} + +#define GET_ID(OP_CHECK, OP) \ + template \ + inline __SYCL_ALWAYS_INLINE \ + std::enable_if_t::value, T> \ + get_identity() { \ + return std::numeric_limits::OP(); \ + } + +GET_ID(IsMinimum, max) +GET_ID(IsMaximum, min) + +#undef GET_ID + +//// Shuffle based masked reduction impls + +// fixed_size_group group scan using shfls +template <__spv::GroupOperation Op, typename Group, typename T, + class BinaryOperation> +inline __SYCL_ALWAYS_INLINE std::enable_if_t, T> +masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + unsigned localIdVal = g.get_local_id()[0]; + for (int i = 1; i < g.get_local_range()[0]; i *= 2) { + auto tmp = non_uniform_shfl(g, MemberMask, x, i); + if (localIdVal >= i) + x = binary_op(x, tmp); + } + if constexpr (Op == __spv::GroupOperation::ExclusiveScan) { + + x = non_uniform_shfl(g, MemberMask, x, 1); + if (localIdVal == 0) { + return get_identity(); + } + } + return x; +} + +template <__spv::GroupOperation Op, typename Group, typename T, + class BinaryOperation> +inline __SYCL_ALWAYS_INLINE std::enable_if_t< + ext::oneapi::experimental::is_user_constructed_group_v && + !is_fixed_size_group_v, + T> +masked_scan_cuda_shfls(Group g, T x, BinaryOperation binary_op, + const uint32_t MemberMask) { + unsigned localIdVal = g.get_local_id()[0]; + unsigned localSetBit = localIdVal + 1; + + for (int i = 1; i < g.get_local_range()[0]; i *= 2) { + int unfoldedSrcSetBit = localSetBit - i; + + auto tmp = non_uniform_shfl(g, MemberMask, x, + __nvvm_fns(MemberMask, 0, unfoldedSrcSetBit)); + if (localIdVal >= i) + x = binary_op(x, tmp); + } + if constexpr (Op == __spv::GroupOperation::ExclusiveScan) { + x = non_uniform_shfl(g, MemberMask, x, + __nvvm_fns(MemberMask, 0, localSetBit - 1)); + if (localIdVal == 0) { + return get_identity(); + } + } + return x; +} + +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp 
b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index de45d5ba3bdef..cbc6be038f4ba 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -163,6 +163,13 @@ struct is_user_constructed_group> } // namespace ext::oneapi::experimental +namespace detail { +template +struct is_fixed_size_group< + ext::oneapi::experimental::fixed_size_group> + : std::true_type {}; +} // namespace detail + template struct is_group< ext::oneapi::experimental::fixed_size_group> diff --git a/sycl/include/sycl/group_algorithm.hpp b/sycl/include/sycl/group_algorithm.hpp index 8963184814b4e..d016839bf6d50 100644 --- a/sycl/include/sycl/group_algorithm.hpp +++ b/sycl/include/sycl/group_algorithm.hpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -207,6 +208,17 @@ reduce_over_group(Group g, T x, BinaryOperation binary_op) { std::is_same_v), "Result type of binary_op must match reduction accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + sycl::vec MemberMask = + sycl::detail::ExtractMask(sycl::detail::GetMask(g)); +#if (__SYCL_CUDA_ARCH__ >= 800) + return detail::masked_reduction_cuda_sm80(g, x, binary_op, MemberMask[0]); +#else + return detail::masked_reduction_cuda_shfls(g, x, binary_op, MemberMask[0]); +#endif + } +#endif return sycl::detail::calc<__spv::GroupOperation::Reduce>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else @@ -375,6 +387,12 @@ template std::enable_if_t>, bool> any_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return __nvvm_vote_any_sync(detail::ExtractMask(detail::GetMask(g))[0], + pred); + } +#endif return sycl::detail::spirv::GroupAny(g, pred); #else (void)g; @@ -415,6 +433,12 @@ template std::enable_if_t>, bool> all_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], + pred); + } +#endif return sycl::detail::spirv::GroupAll(g, pred); #else (void)g; @@ -455,6 +479,12 @@ template std::enable_if_t>, bool> none_of_group(Group g, bool pred) { #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return __nvvm_vote_all_sync(detail::ExtractMask(detail::GetMask(g))[0], + !pred); + } +#endif return sycl::detail::spirv::GroupAll(g, !pred); #else (void)g; @@ -573,6 +603,13 @@ std::enable_if_t<(is_group_v> && T> group_broadcast(Group g, T x, typename Group::id_type local_id) { #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + auto LocalId = detail::IdToMaskPosition(g, local_id); + return __nvvm_shfl_sync_idx_i32(detail::ExtractMask(detail::GetMask(g))[0], + x, LocalId, 31); + } +#endif return sycl::detail::spirv::GroupBroadcast(g, x, local_id); #else (void)g; @@ -636,6 +673,13 @@ exclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { std::is_same_v), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return 
detail::masked_scan_cuda_shfls<__spv::GroupOperation::ExclusiveScan>( + g, x, binary_op, + sycl::detail::ExtractMask(sycl::detail::GetMask(g))[0]); + } +#endif return sycl::detail::calc<__spv::GroupOperation::ExclusiveScan>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else @@ -865,6 +909,13 @@ inclusive_scan_over_group(Group g, T x, BinaryOperation binary_op) { std::is_same_v), "Result type of binary_op must match scan accumulation type."); #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__NVPTX__) + if constexpr (ext::oneapi::experimental::is_user_constructed_group_v) { + return detail::masked_scan_cuda_shfls<__spv::GroupOperation::InclusiveScan>( + g, x, binary_op, + sycl::detail::ExtractMask(sycl::detail::GetMask(g))[0]); + } +#endif return sycl::detail::calc<__spv::GroupOperation::InclusiveScan>( g, typename sycl::detail::GroupOpTag::type(), x, binary_op); #else diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp index da27e89bc2458..03cb9e5ba6a7f 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -2,7 +2,7 @@ // RUN: %{run} %t.out // // REQUIRES: gpu -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip #include #include diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp index a338a6cd6f98a..d2ed6e97baf65 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp @@ -2,7 +2,7 @@ // RUN: %{run} %t.out // // REQUIRES: gpu -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip #include #include diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp index ef364bf243425..93636a8156167 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp @@ -2,7 +2,7 @@ // RUN: %{run} %t.out // // REQUIRES: gpu -// UNSUPPORTED: cuda || hip +// UNSUPPORTED: hip #include #include From 56e05afa3b323aa0fcf1c8d47fd97560c6818eb9 Mon Sep 17 00:00:00 2001 From: mmoadeli Date: Tue, 13 Jun 2023 15:15:14 +0100 Subject: [PATCH 05/55] [SYCL][CUDA] Improve function to guess local work size more efficiently. (#9787) * The `threadsPerBlock` values computed by `guessLocalWorkSize` are not the most optimal values. In particular the `threadsPerBlock` for `Y` and `Z` were much below the possible values. * When Y/Z values of range are prime a very poor performance is witnessed as shown in the associated [issue](https://github.com/intel/llvm/issues/8018) * This PR compute `threadsPerBlock` for X/Y/Z to reduce corresponding `BlocksPerGrid` values. * Below presents the output of the code in associated issue without the changes in this PR. 
Device = NVIDIA GeForce GTX 1050 Ti N, elapsed(ms) - 1009,4.61658 - 2003,45.6869 - 3001,67.5192 - 4001,88.1543 - 5003,111.338 - 6007,132.848 - 7001,154.697 - 8009,175.452 - 9001,196.237 - 10007,219.39 - 1000,4.59423 - 2000,4.61525 - 3000,4.61935 - 4000,4.62526 - 5000,4.64623 - 6000,4.78904 - 7000,8.92251 - 8000,8.97263 - 9000,9.06992 - 10000,9.03802 * And below shows the output with the PR's updates Device = NVIDIA GeForce GTX 1050 Ti N, elapsed(ms) - 1009,4.58252 - 2003,4.60139 - 3001,3.47269 - 4001,3.62314 - 5003,4.15179 - 6007,7.07976 - 7001,7.49027 - 8009,8.00097 - 9001,9.08756 - 10007,8.0005 - 1000,4.56335 - 2000,4.60376 - 3000,4.76395 - 4000,4.63283 - 5000,4.64732 - 6000,4.63936 - 7000,8.97499 - 8000,8.9941 - 9000,9.01531 - 10000,9.00935 --- sycl/plugins/cuda/pi_cuda.cpp | 48 +++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index a19889704ce40..dd68c196e94c1 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -305,25 +306,49 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, assert(threadsPerBlock != nullptr); assert(global_work_size != nullptr); assert(kernel != nullptr); - int minGrid, maxBlockSize, gridDim[3]; + int minGrid, maxBlockSize, maxBlockDim[3]; - cuDeviceGetAttribute(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + static auto isPrime = [](size_t number) -> bool { + auto lastNumToCheck = ceil(sqrt(number)); + if (number < 2) + return false; + if (number == 2) + return true; + if (number % 2 == 0) + return false; + for (int i = 3; i <= lastNumToCheck; i += 2) { + if (number % i == 0) + return false; + } + return true; + }; + + cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device->get()); - cuDeviceGetAttribute(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device->get()); - threadsPerBlock[1] = ((global_work_size[1] - 1) / gridDim[1]) + 1; - threadsPerBlock[2] = ((global_work_size[2] - 1) / gridDim[2]) + 1; - PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, maxThreadsPerBlock[0])); - gridDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - + threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); + threadsPerBlock[1] = + std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], + size_t(maxBlockDim[1]))); + maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); threadsPerBlock[0] = std::min(maxThreadsPerBlock[0], - std::min(global_work_size[0], static_cast(gridDim[0]))); + std::min(global_work_size[0], size_t(maxBlockDim[0]))); + + // When global_work_size[0] is prime threadPerBlock[0] will later computed as + // 1, which is not efficient configuration. In such case we use + // global_work_size[0] + 1 to compute threadPerBlock[0]. + int adjusted_0_dim_global_work_size = + (isPrime(global_work_size[0]) && + (threadsPerBlock[0] != global_work_size[0])) + ? global_work_size[0] + 1 + : global_work_size[0]; static auto isPowerOf2 = [](size_t value) -> bool { return value && !(value & (value - 1)); @@ -333,7 +358,7 @@ void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, // work group size to produce uniform work groups. 
// Additionally, for best compute utilisation, the local size has // to be a power of two. - while (0u != (global_work_size[0] % threadsPerBlock[0]) || + while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || !isPowerOf2(threadsPerBlock[0])) { --threadsPerBlock[0]; } @@ -2161,7 +2186,8 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) == CUDA_SUCCESS); // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written - sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == 12); + sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) == + 12); return getInfoArray(strnlen(AddressBuffer, AddressBufferSize - 1) + 1, param_value_size, param_value, param_value_size_ret, AddressBuffer); From 93eb9ffe5779443d7b77fb1066c0845baa0e1b05 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 13 Jun 2023 15:29:29 +0100 Subject: [PATCH 06/55] [SYCL][CUDA] Add missing device scope to atomic fence (#9824) This patch maps `Device` scope fence to the right NVVM built-in. It would previously incorrectly use the CTA (threadblock) variant. --- libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl index 467053bb0f9d6..84560beda9d6e 100644 --- a/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl +++ b/libclc/ptx-nvidiacl/libspirv/synchronization/barrier.cl @@ -13,6 +13,8 @@ _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int memory, unsigned int semantics) { if (memory == CrossDevice) { __nvvm_membar_sys(); + } else if (memory == Device) { + __nvvm_membar_gl(); } else { __nvvm_membar_cta(); } From a055665574afc9d4277ee8b6d7463a4f02ea9651 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 13 Jun 2023 07:43:28 -0700 Subject: [PATCH 07/55] [CI] Improve devops/actions/cached_checkout (#9831) * Add ability to skip the merge * Setup alternates on the filesystem level so that other jobs in the workflow could work with GIT without setting the environment variable. 
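In effect, the setup step now records the cached object store in the checkout's own alternates file, so later steps can run plain git without exporting GIT_ALTERNATE_OBJECT_DIRECTORIES. Roughly, as an illustrative sketch only (the paths below are assumptions, not values hard-coded by the action):

```bash
# Illustrative sketch: let a fresh checkout in ./src resolve objects from a
# cached clone, without setting GIT_ALTERNATE_OBJECT_DIRECTORIES in every
# subsequent step of the workflow.
CACHE_OBJECTS=/__w/repo_cache/intel/llvm/.git/objects   # assumed cache path
echo "$CACHE_OBJECTS" > ./src/.git/objects/info/alternates
git -C ./src log --oneline -1   # plain git now sees the borrowed objects
```

Callers keep the old behaviour of merging the default branch by default; passing `merge: false` skips that step.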
--- devops/actions/cached_checkout/action.yml | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/devops/actions/cached_checkout/action.yml b/devops/actions/cached_checkout/action.yml index 5413cad7a2457..dc31da2f5d007 100644 --- a/devops/actions/cached_checkout/action.yml +++ b/devops/actions/cached_checkout/action.yml @@ -16,6 +16,9 @@ inputs: default_branch: description: 'Name of the default branch' default: 'sycl' + merge: + description: 'Merge default branch after the checkout' + default: true runs: using: 'composite' @@ -41,10 +44,16 @@ runs: ref: ${{ inputs.ref }} path: ${{ inputs.path }} fetch-depth: 0 - - name: Merge sycl HEAD into current branch + - name: Setup alternates shell: bash env: GIT_ALTERNATE_OBJECT_DIRECTORIES: ${{ inputs.cache_path }}/${{ inputs.repository }}/.git/objects + run: | + echo $GIT_ALTERNATE_OBJECT_DIRECTORIES > ${{ inputs.path }}/.git/objects/info/alternates + - name: Merge sycl HEAD into current branch + shell: bash + if: ${{ inputs.merge == 'true' }} + env: DEFAULT_BRANCH: ${{ inputs.default_branch }} run: | cd ${{ inputs.path }} From b3e0428ffef4edddb1e7236219de056de8d0ffcc Mon Sep 17 00:00:00 2001 From: marcin-smuklerz-mobica <133634458+marcin-smuklerz-mobica@users.noreply.github.com> Date: Tue, 13 Jun 2023 17:26:06 +0200 Subject: [PATCH 08/55] [SYCL] Non-standard RT namespace removed(#7133) (#9837) --- sycl/include/sycl/detail/pi.hpp | 3 --- sycl/include/sycl/handler.hpp | 2 +- sycl/source/context.cpp | 2 +- sycl/source/detail/usm/usm_impl.cpp | 2 +- sycl/source/event.cpp | 4 ++-- sycl/source/handler.cpp | 4 ++-- sycl/source/queue.cpp | 2 +- sycl/unittests/kernel-and-program/KernelInfo.cpp | 4 ++-- sycl/unittests/kernel-and-program/KernelRelease.cpp | 4 ++-- .../kernel-and-program/PersistentDeviceCodeCache.cpp | 2 +- sycl/unittests/scheduler/Commands.cpp | 5 +++-- 11 files changed, 16 insertions(+), 18 deletions(-) diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp index 01f7cb621e6b7..ebd2eedaf4ea0 100644 --- a/sycl/include/sycl/detail/pi.hpp +++ b/sycl/include/sycl/detail/pi.hpp @@ -275,9 +275,6 @@ template To cast(std::vector Values) { } // namespace pi } // namespace detail -// For shortness of using PI from the top-level sycl files. -namespace RT = sycl::detail::pi; - } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index c37b05ee51ef8..86313cf734dae 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -3199,7 +3199,7 @@ class __SYCL_EXPORT handler { } // Set value of the gpu cache configuration for the kernel. 
- void setKernelCacheConfig(RT::PiKernelCacheConfig); + void setKernelCacheConfig(sycl::detail::pi::PiKernelCacheConfig); }; } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index d911d057f609c..4923621a95251 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -85,7 +85,7 @@ context::context(const std::vector &DeviceList, } } context::context(cl_context ClContext, async_handler AsyncHandler) { - const auto &Plugin = RT::getPlugin(); + const auto &Plugin = sycl::detail::pi::getPlugin(); impl = std::make_shared( detail::pi::cast(ClContext), AsyncHandler, Plugin); } diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp index 49f4d69294724..70acfc07a8c98 100644 --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -589,7 +589,7 @@ alloc get_pointer_type(const void *Ptr, const context &Ctxt) { // query type using PI function const detail::PluginPtr &Plugin = CtxImpl->getPlugin(); - RT::PiResult Err = + sycl::detail::pi::PiResult Err = Plugin->call_nocheck( PICtx, Ptr, PI_MEM_ALLOC_TYPE, sizeof(pi_usm_type), &AllocTy, nullptr); diff --git a/sycl/source/event.cpp b/sycl/source/event.cpp index 8a7033fc8b307..a4b2370e97a63 100644 --- a/sycl/source/event.cpp +++ b/sycl/source/event.cpp @@ -27,11 +27,11 @@ event::event() : impl(std::make_shared(std::nullopt)) {} event::event(cl_event ClEvent, const context &SyclContext) : impl(std::make_shared( - detail::pi::cast(ClEvent), SyclContext)) { + detail::pi::cast(ClEvent), SyclContext)) { // This is a special interop constructor for OpenCL, so the event must be // retained. impl->getPlugin()->call( - detail::pi::cast(ClEvent)); + detail::pi::cast(ClEvent)); } bool event::operator==(const event &rhs) const { return rhs.impl == impl; } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index bc1956a3f4d91..1704321317d8c 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -188,9 +188,9 @@ event handler::finalize() { // this faster path is used to submit kernel bypassing scheduler and // avoiding CommandGroup, Command objects creation. 
- std::vector RawEvents; + std::vector RawEvents; detail::EventImplPtr NewEvent; - RT::PiEvent *OutEvent = nullptr; + sycl::detail::pi::PiEvent *OutEvent = nullptr; auto EnqueueKernel = [&]() { // 'Result' for single point of return diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 1a814fed67c43..90bc1bf3d3a06 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -66,7 +66,7 @@ queue::queue(cl_command_queue clQueue, const context &SyclContext, const async_handler &AsyncHandler) { const property_list PropList{}; impl = std::make_shared( - reinterpret_cast(clQueue), + reinterpret_cast(clQueue), detail::getSyclObjImpl(SyclContext), AsyncHandler, PropList); } diff --git a/sycl/unittests/kernel-and-program/KernelInfo.cpp b/sycl/unittests/kernel-and-program/KernelInfo.cpp index 98bb1f0e0f667..a5b406ba469c5 100644 --- a/sycl/unittests/kernel-and-program/KernelInfo.cpp +++ b/sycl/unittests/kernel-and-program/KernelInfo.cpp @@ -45,8 +45,8 @@ static pi_result redefinedKernelGetInfo(pi_kernel kernel, size_t *param_value_size_ret) { EXPECT_EQ(param_name, PI_KERNEL_INFO_CONTEXT) << "Unexpected kernel info requested"; - auto *Result = reinterpret_cast(param_value); - RT::PiContext PiCtx = + auto *Result = reinterpret_cast(param_value); + sycl::detail::pi::PiContext PiCtx = detail::getSyclObjImpl(TestContext->Ctx)->getHandleRef(); *Result = PiCtx; return PI_SUCCESS; diff --git a/sycl/unittests/kernel-and-program/KernelRelease.cpp b/sycl/unittests/kernel-and-program/KernelRelease.cpp index b6f616c34461e..a982772db573c 100644 --- a/sycl/unittests/kernel-and-program/KernelRelease.cpp +++ b/sycl/unittests/kernel-and-program/KernelRelease.cpp @@ -53,8 +53,8 @@ static pi_result redefinedKernelGetInfo(pi_kernel kernel, size_t *param_value_size_ret) { EXPECT_EQ(param_name, PI_KERNEL_INFO_CONTEXT) << "Unexpected kernel info requested"; - auto *Result = reinterpret_cast(param_value); - RT::PiContext PiCtx = + auto *Result = reinterpret_cast(param_value); + sycl::detail::pi::PiContext PiCtx = detail::getSyclObjImpl(TestContext->Ctx)->getHandleRef(); *Result = PiCtx; return PI_SUCCESS; diff --git a/sycl/unittests/kernel-and-program/PersistentDeviceCodeCache.cpp b/sycl/unittests/kernel-and-program/PersistentDeviceCodeCache.cpp index 8c418c0e692cb..6ee9f5658bdbc 100644 --- a/sycl/unittests/kernel-and-program/PersistentDeviceCodeCache.cpp +++ b/sycl/unittests/kernel-and-program/PersistentDeviceCodeCache.cpp @@ -236,7 +236,7 @@ class PersistentDeviceCodeCache /*PropertySetsEnd*/ nullptr}; pi_device_binary Bin = &BinStruct; detail::RTDeviceBinaryImage Img{Bin}; - RT::PiProgram NativeProg; + sycl::detail::pi::PiProgram NativeProg; }; /* Checks that key values with \0 symbols are processed correctly diff --git a/sycl/unittests/scheduler/Commands.cpp b/sycl/unittests/scheduler/Commands.cpp index aa9c750d85f55..a995800643421 100644 --- a/sycl/unittests/scheduler/Commands.cpp +++ b/sycl/unittests/scheduler/Commands.cpp @@ -26,11 +26,12 @@ pi_result redefinePiEnqueueEventsWaitWithBarrier(pi_queue Queue, } // Hack that allows to return a context in redefinePiEventGetInfo -RT::PiContext queue_global_context = nullptr; +sycl::detail::pi::PiContext queue_global_context = nullptr; pi_result redefinePiEventGetInfo(pi_event, pi_event_info, size_t, void *param_value, size_t *) { - *reinterpret_cast(param_value) = queue_global_context; + *reinterpret_cast(param_value) = + queue_global_context; return PI_SUCCESS; } From 36e6e06ccda8eff5c9c189536d2b1fe288e7a082 Mon Sep 17 00:00:00 2001 From: Michael Toguchi 
Date: Tue, 13 Jun 2023 10:20:16 -0700 Subject: [PATCH 09/55] [SYCL][NFC] Remove llvm-no-spir-kernel tool (#9710) The llvm-no-spir-kernel tool is no longer in use. Remove the creation, tests and driver infrastructure to use the tool. Also remove the reference from the docs. --- clang/include/clang/Driver/Action.h | 16 ---- clang/include/clang/Driver/ToolChain.h | 2 - clang/lib/Driver/Action.cpp | 7 -- clang/lib/Driver/ToolChain.cpp | 9 --- clang/lib/Driver/ToolChains/Clang.cpp | 39 --------- clang/lib/Driver/ToolChains/Clang.h | 13 --- llvm/test/CMakeLists.txt | 1 - .../tools/llvm-no-spir-kernel/error-code.ll | 11 --- .../llvm-no-spir-kernel/has-spir-kernel.ll | 15 ---- .../llvm-no-spir-kernel/invalid-input.ll | 2 - .../llvm-no-spir-kernel/no-spir-kernel.ll | 8 -- .../llvm-no-spir-kernel/no-spir-kernel2.ll | 12 --- llvm/tools/llvm-no-spir-kernel/CMakeLists.txt | 13 --- llvm/tools/llvm-no-spir-kernel/LLVMBuild.txt | 21 ----- .../llvm-no-spir-kernel.cpp | 79 ------------------- sycl/CMakeLists.txt | 2 - sycl/doc/design/CompilerAndRuntimeDesign.md | 9 --- 17 files changed, 259 deletions(-) delete mode 100644 llvm/test/tools/llvm-no-spir-kernel/error-code.ll delete mode 100755 llvm/test/tools/llvm-no-spir-kernel/has-spir-kernel.ll delete mode 100644 llvm/test/tools/llvm-no-spir-kernel/invalid-input.ll delete mode 100755 llvm/test/tools/llvm-no-spir-kernel/no-spir-kernel.ll delete mode 100755 llvm/test/tools/llvm-no-spir-kernel/no-spir-kernel2.ll delete mode 100644 llvm/tools/llvm-no-spir-kernel/CMakeLists.txt delete mode 100644 llvm/tools/llvm-no-spir-kernel/LLVMBuild.txt delete mode 100644 llvm/tools/llvm-no-spir-kernel/llvm-no-spir-kernel.cpp diff --git a/clang/include/clang/Driver/Action.h b/clang/include/clang/Driver/Action.h index 45e9133b7ed13..b4105623ed7b8 100644 --- a/clang/include/clang/Driver/Action.h +++ b/clang/include/clang/Driver/Action.h @@ -79,7 +79,6 @@ class Action { OffloadPackagerJobClass, OffloadDepsJobClass, SPIRVTranslatorJobClass, - SPIRCheckJobClass, SYCLPostLinkJobClass, BackendCompileJobClass, FileTableTformJobClass, @@ -750,21 +749,6 @@ class SPIRVTranslatorJobAction : public JobAction { } }; -// Provides a check of the given input file for the existence of SPIR kernel -// code. This is currently only used for FPGA specific tool chains and can -// be expanded to perform other SPIR checks if needed. -// TODO: No longer being used for FPGA (or elsewhere), cleanup needed. 
-class SPIRCheckJobAction : public JobAction { - void anchor() override; - -public: - SPIRCheckJobAction(Action *Input, types::ID OutputType); - - static bool classof(const Action *A) { - return A->getKind() == SPIRCheckJobClass; - } -}; - class SYCLPostLinkJobAction : public JobAction { void anchor() override; diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index 141412c2a78f2..4abc92186fd05 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -160,7 +160,6 @@ class ToolChain { mutable std::unique_ptr OffloadPackager; mutable std::unique_ptr OffloadDeps; mutable std::unique_ptr SPIRVTranslator; - mutable std::unique_ptr SPIRCheck; mutable std::unique_ptr SYCLPostLink; mutable std::unique_ptr BackendCompiler; mutable std::unique_ptr AppendFooter; @@ -180,7 +179,6 @@ class ToolChain { Tool *getOffloadPackager() const; Tool *getOffloadDeps() const; Tool *getSPIRVTranslator() const; - Tool *getSPIRCheck() const; Tool *getSYCLPostLink() const; Tool *getBackendCompiler() const; Tool *getAppendFooter() const; diff --git a/clang/lib/Driver/Action.cpp b/clang/lib/Driver/Action.cpp index 4cb0225cad293..be87f334644b0 100644 --- a/clang/lib/Driver/Action.cpp +++ b/clang/lib/Driver/Action.cpp @@ -50,8 +50,6 @@ const char *Action::getClassName(ActionClass AC) { return "clang-offload-deps"; case SPIRVTranslatorJobClass: return "llvm-spirv"; - case SPIRCheckJobClass: - return "llvm-no-spir-kernel"; case SYCLPostLinkJobClass: return "sycl-post-link"; case BackendCompileJobClass: @@ -508,11 +506,6 @@ SPIRVTranslatorJobAction::SPIRVTranslatorJobAction(Action *Input, types::ID Type) : JobAction(SPIRVTranslatorJobClass, Input, Type) {} -void SPIRCheckJobAction::anchor() {} - -SPIRCheckJobAction::SPIRCheckJobAction(Action *Input, types::ID Type) - : JobAction(SPIRCheckJobClass, Input, Type) {} - void SYCLPostLinkJobAction::anchor() {} SYCLPostLinkJobAction::SYCLPostLinkJobAction(Action *Input, diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 6040dec40c260..dd1a2f9519008 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -417,12 +417,6 @@ Tool *ToolChain::getSPIRVTranslator() const { return SPIRVTranslator.get(); } -Tool *ToolChain::getSPIRCheck() const { - if (!SPIRCheck) - SPIRCheck.reset(new tools::SPIRCheck(*this)); - return SPIRCheck.get(); -} - Tool *ToolChain::getSYCLPostLink() const { if (!SYCLPostLink) SYCLPostLink.reset(new tools::SYCLPostLink(*this)); @@ -508,9 +502,6 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const { case Action::SPIRVTranslatorJobClass: return getSPIRVTranslator(); - case Action::SPIRCheckJobClass: - return getSPIRCheck(); - case Action::SYCLPostLinkJobClass: return getSYCLPostLink(); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index dcd391f7c7569..1acb5cfe6c016 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9742,45 +9742,6 @@ void SPIRVTranslator::ConstructJob(Compilation &C, const JobAction &JA, C.addCommand(std::move(Cmd)); } -void SPIRCheck::ConstructJob(Compilation &C, const JobAction &JA, - const InputInfo &Output, - const InputInfoList &Inputs, - const llvm::opt::ArgList &TCArgs, - const char *LinkingOutput) const { - // Construct llvm-no-spir-kernel command. 
- assert(isa(JA) && "Expecting SPIR Check job!"); - - // The spir check command looks like this: - // llvm-no-spir-kernel .bc - // Upon success, we just move ahead. Error means the check failed and - // we need to exit. The expected output is the input as this is just an - // intermediate check with no functional change. - ArgStringList CheckArgs; - assert(Inputs.size() == 1 && "Unexpected number of inputs to the tool"); - const InputInfo &InputFile = Inputs.front(); - CheckArgs.push_back(InputFile.getFilename()); - - // Add output file, which is just a copy of the input to better fit in the - // toolchain flow. - CheckArgs.push_back("-o"); - CheckArgs.push_back(Output.getFilename()); - auto Cmd = std::make_unique( - JA, *this, ResponseFileSupport::None(), - TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())), - CheckArgs, std::nullopt); - - if (getToolChain().getTriple().getSubArch() == - llvm::Triple::SPIRSubArch_fpga) { - const char *Msg = TCArgs.MakeArgString( - Twine("The FPGA image does not include all device kernels from ") + - Twine(InputFile.getBaseInput()) + - Twine(". Please re-generate the image")); - Cmd->addDiagForErrorCode(/*ErrorCode*/ 1, Msg); - } - - C.addCommand(std::move(Cmd)); -} - static void addArgs(ArgStringList &DstArgs, const llvm::opt::ArgList &Alloc, ArrayRef SrcArgs) { for (const auto Arg : SrcArgs) { diff --git a/clang/lib/Driver/ToolChains/Clang.h b/clang/lib/Driver/ToolChains/Clang.h index d65df3a1df72b..b04e6d9303d9f 100644 --- a/clang/lib/Driver/ToolChains/Clang.h +++ b/clang/lib/Driver/ToolChains/Clang.h @@ -228,19 +228,6 @@ class LLVM_LIBRARY_VISIBILITY SPIRVTranslator final : public Tool { const char *LinkingOutput) const override; }; -/// SPIR Checking tool. -class LLVM_LIBRARY_VISIBILITY SPIRCheck final : public Tool { -public: - SPIRCheck(const ToolChain &TC) - : Tool("SPIR Checker", "llvm-no-spir-kernel", TC) {} - - bool hasIntegratedCPP() const override { return false; } - void ConstructJob(Compilation &C, const JobAction &JA, - const InputInfo &Output, const InputInfoList &Inputs, - const llvm::opt::ArgList &TCArgs, - const char *LinkingOutput) const override; -}; - /// SYCL post-link device code processing tool. class LLVM_LIBRARY_VISIBILITY SYCLPostLink final : public Tool { public: diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 0e32c144acfdd..7e355d6923298 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -109,7 +109,6 @@ set(LLVM_TEST_DEPENDS llvm-modextract llvm-mt llvm-nm - llvm-no-spir-kernel llvm-objcopy llvm-objdump llvm-opt-fuzzer diff --git a/llvm/test/tools/llvm-no-spir-kernel/error-code.ll b/llvm/test/tools/llvm-no-spir-kernel/error-code.ll deleted file mode 100644 index ea556960180a3..0000000000000 --- a/llvm/test/tools/llvm-no-spir-kernel/error-code.ll +++ /dev/null @@ -1,11 +0,0 @@ -; UNSUPPORTED: system-windows - -; Check the return code -; RUN: llvm-no-spir-kernel %s; \ -; RUN: if [ $? 
= 1 ]; then exit 0; else exit 1; fi - -; expected failure -define spir_kernel void @foo() { -bb: - ret void -} diff --git a/llvm/test/tools/llvm-no-spir-kernel/has-spir-kernel.ll b/llvm/test/tools/llvm-no-spir-kernel/has-spir-kernel.ll deleted file mode 100755 index a1f8990b46429..0000000000000 --- a/llvm/test/tools/llvm-no-spir-kernel/has-spir-kernel.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: not llvm-no-spir-kernel %s 2>&1 | FileCheck %s - -; expected no failures -define void @foo() { -bb: - ret void -} - -; expected failure -; CHECK: error: Unexpected SPIR kernel occurrence: -; CHECK-SAME: foo2 -define spir_kernel void @foo2() { -bb: - ret void -} diff --git a/llvm/test/tools/llvm-no-spir-kernel/invalid-input.ll b/llvm/test/tools/llvm-no-spir-kernel/invalid-input.ll deleted file mode 100644 index dab8826bef675..0000000000000 --- a/llvm/test/tools/llvm-no-spir-kernel/invalid-input.ll +++ /dev/null @@ -1,2 +0,0 @@ -; RUN: echo garbage > garbage.ll -; RUN: not llvm-no-spir-kernel garbage.ll diff --git a/llvm/test/tools/llvm-no-spir-kernel/no-spir-kernel.ll b/llvm/test/tools/llvm-no-spir-kernel/no-spir-kernel.ll deleted file mode 100755 index 08548ede481bc..0000000000000 --- a/llvm/test/tools/llvm-no-spir-kernel/no-spir-kernel.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: llvm-no-spir-kernel %s - -define void @foo() { -bb: - ret void -} - - diff --git a/llvm/test/tools/llvm-no-spir-kernel/no-spir-kernel2.ll b/llvm/test/tools/llvm-no-spir-kernel/no-spir-kernel2.ll deleted file mode 100755 index a4fc32fdd4276..0000000000000 --- a/llvm/test/tools/llvm-no-spir-kernel/no-spir-kernel2.ll +++ /dev/null @@ -1,12 +0,0 @@ -; RUN: llvm-no-spir-kernel %s - -define void @foo() { -bb: - ret void -} - -define void @foo2() { -bb: - ret void -} - diff --git a/llvm/tools/llvm-no-spir-kernel/CMakeLists.txt b/llvm/tools/llvm-no-spir-kernel/CMakeLists.txt deleted file mode 100644 index e5679c9bbd3f4..0000000000000 --- a/llvm/tools/llvm-no-spir-kernel/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -set(LLVM_LINK_COMPONENTS - Core - Demangle - IRReader - Support - ) - -add_llvm_tool(llvm-no-spir-kernel - llvm-no-spir-kernel.cpp - - DEPENDS - intrinsics_gen - ) diff --git a/llvm/tools/llvm-no-spir-kernel/LLVMBuild.txt b/llvm/tools/llvm-no-spir-kernel/LLVMBuild.txt deleted file mode 100644 index dc74a84e85e54..0000000000000 --- a/llvm/tools/llvm-no-spir-kernel/LLVMBuild.txt +++ /dev/null @@ -1,21 +0,0 @@ -;===- ./tools/llvm-no-spir-kernel/LLVMBuild.txt ----------------*- Conf -*--===; -; -; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -; See https://llvm.org/LICENSE.txt for license information. -; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. 
-; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Tool -name = llvm-no-spir-kernel -parent = Tools -required_libraries = AsmParser BitReader IRReader diff --git a/llvm/tools/llvm-no-spir-kernel/llvm-no-spir-kernel.cpp b/llvm/tools/llvm-no-spir-kernel/llvm-no-spir-kernel.cpp deleted file mode 100644 index 1c50d29dfdcbf..0000000000000 --- a/llvm/tools/llvm-no-spir-kernel/llvm-no-spir-kernel.cpp +++ /dev/null @@ -1,79 +0,0 @@ -//===--- llvm-no-spir-kernel.cpp - Utility check spir kernel entry point --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This utility checks if the input module contains functions that are a SPIR -// kernel. -// -// - Return 0 if the LLVM module is "clean" from SPIR kernels -// - Return 1 upon the first SPIR kernel occurence -// -// Use of an output file is not required for a successful check. It is used -// to allow for proper input and output flow within the driver toolchain. -// -// Usage: llvm-no-spir-kernel input.bc/input.ll -o output.bc/output.ll -// -//===----------------------------------------------------------------------===// - -#include "llvm/Demangle/Demangle.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/Path.h" -using namespace llvm; - -// InputFilename - The filename to read from. -static cl::opt InputFilename(cl::Positional, - cl::desc(""), - cl::init("-"), - cl::value_desc("filename")); - -// Output - The filename to output to. 
-static cl::opt Output("o", - cl::desc(""), - cl::value_desc("filename")); - - -int main(int argc, char **argv) { - InitLLVM X(argc, argv); - - LLVMContext Context; - cl::ParseCommandLineOptions(argc, argv, "llvm no spir kernel\n"); - - // Use lazy loading, since we only care about function calling convention - SMDiagnostic Err; - const char *ProgramName = llvm::sys::path::filename(argv[0]).data(); - std::unique_ptr M = getLazyIRFileModule(InputFilename, Err, Context); - - if (!M.get()) { - Err.print(ProgramName, errs()); - return 1; - } - - for (auto &F : *M) { - if (F.getCallingConv() == CallingConv::SPIR_KERNEL) { - std::string SPIRKernelMsg = - "Unexpected SPIR kernel occurrence: " + demangle(F.getName().str()); - SMDiagnostic SPIRKernelDiag(InputFilename, SourceMgr::DiagKind::DK_Error, - SPIRKernelMsg); - SPIRKernelDiag.print(ProgramName, errs()); - return 1; - } - } - - // When given an output file, just copy the input to the output - if (!Output.empty() && !InputFilename.empty()) { - llvm::sys::fs::copy_file(InputFilename, Output); - } - - return 0; -} diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index a140e66fbade9..58fc3bad47175 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -303,7 +303,6 @@ add_custom_target(sycl-compiler llc llvm-ar llvm-foreach - llvm-no-spir-kernel llvm-spirv llvm-link llvm-objcopy @@ -391,7 +390,6 @@ set( SYCL_TOOLCHAIN_DEPLOY_COMPONENTS llc llvm-ar llvm-foreach - llvm-no-spir-kernel llvm-spirv llvm-link llvm-objcopy diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index f06b0d52c257d..e72b7728085b1 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -423,15 +423,6 @@ Case 1 can be identified in the device binary generation stage (step 1) by scanning the known kernels. Case 2 must be verified by the driver by checking for newly introduced kernels in the final link stage (step 3). -The llvm-no-spir-kernel tool was introduced to facilitate checking for case 2 in -the driver. It detects if a module includes kernels and is invoked as follows: - -```bash -llvm-no-spir-kernel host.bc -``` - -It returns 0 if no kernels are present and 1 otherwise. - #### Device code post-link step At link time all the device code is linked into From 2e6f732ae16077b86802c83cdb8312eb6197b186 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 13 Jun 2023 10:55:22 -0700 Subject: [PATCH 10/55] [CI] Move lint job to self-hosted runner (#9844) Github's allocation of default ubuntu-* runners isn't reliably stable, so keep moving tasks to self hosted runners. We don't use the cuda runner currently, so assign those to it for the time being. Later we should be able to extend those utility tasks to run on generic `Linux` class of self-hosted runners. --- .github/workflows/sycl_precommit.yml | 33 ++++++++++++++++++++++---- devops/actions/clang-format/action.yml | 11 ++++++--- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/.github/workflows/sycl_precommit.yml b/.github/workflows/sycl_precommit.yml index 64fe2e1253549..8dc8cc1b18969 100644 --- a/.github/workflows/sycl_precommit.yml +++ b/.github/workflows/sycl_precommit.yml @@ -31,19 +31,42 @@ jobs: uses: ./.github/workflows/sycl_detect_changes.yml lint: - runs-on: ubuntu-22.04 + runs-on: cuda container: image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:no-drivers + # actions/checkout fails without "--privileged". 
+ options: -u 1001:1001 --privileged steps: + - name: Fake actions/checkout task + uses: actions/checkout@v3 + with: + # cached_checkout below uses actions/checkout internally. However, when + # actions/checkout is run from within another action step (not from + # workflow), github seems to try to download from within the container + # and doesn't have requried filesystem permissions. Make sure it's + # already downloaded by the time it's needed by checking out some small + # repository. + repository: actions/checkout + path: fake-checkout - name: 'PR commits + 1' run: echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 1 ))" >> "${GITHUB_ENV}" - - uses: actions/checkout@v3 + - name: Setup action + # We can switch to `cp -r /actions .` once changes in cached_checkout are + # propagated into the nightly container image. + run: | + mkdir -p actions/cached_checkout + wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/cached_checkout/action.yml -P ./actions/cached_checkout + - uses: ./actions/cached_checkout with: - ref: ${{ github.event.pull_request.head.sha }} - persist-credentials: false + path: src fetch-depth: ${{ env.PR_FETCH_DEPTH }} + ref: ${{ github.event.pull_request.head.sha }} + cache_path: "/__w/repo_cache/" + merge: false - name: Run clang-format - uses: ./devops/actions/clang-format + uses: ./src/devops/actions/clang-format + with: + path: src # This job generates matrix of tests for SYCL End-to-End tests test_matrix: diff --git a/devops/actions/clang-format/action.yml b/devops/actions/clang-format/action.yml index 973d9a5288ecd..cf448ee4c5c11 100644 --- a/devops/actions/clang-format/action.yml +++ b/devops/actions/clang-format/action.yml @@ -1,14 +1,19 @@ name: 'clang-format' description: 'Run clang-format on pull request' +inputs: + path: + description: Path to the checkout + required: true + runs: using: "composite" steps: - name: Run clang-format for the patch shell: bash {0} run: | - git config --global --add safe.directory /__w/llvm/llvm - git clang-format ${{ github.event.pull_request.base.sha }} - git diff > ./clang-format.patch + git config --global --add safe.directory ${{ inputs.path }} + git -C ${{ inputs.path }} clang-format ${{ github.event.pull_request.base.sha }} + git -C ${{ inputs.path }} diff > ./clang-format.patch # Add patch with formatting fixes to CI job artifacts - uses: actions/upload-artifact@v1 with: From 53c8089674f28fc6f666f52cc1beeafe7fd00efd Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 13 Jun 2023 11:13:11 -0700 Subject: [PATCH 11/55] [CI] Don't install lz4 (#9848) We decided to use "zstd" instead. 
--- devops/scripts/install_build_tools.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/devops/scripts/install_build_tools.sh b/devops/scripts/install_build_tools.sh index e98c3d98305a5..d04522b07cd4a 100755 --- a/devops/scripts/install_build_tools.sh +++ b/devops/scripts/install_build_tools.sh @@ -19,7 +19,6 @@ apt update && apt install -yqq \ libdw1 \ wget \ sudo \ - lz4 \ zstd pip3 install psutil From 6b7424349ee3592909afde25f9921c4e8b83db10 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 13 Jun 2023 11:30:53 -0700 Subject: [PATCH 12/55] [CI] Enable HIP/CUDA/ESIMD plugins in nightly build (#9850) That is needed so that we could use the resulting image to test PRs that only touch SYCL End-to-End tests --- .github/workflows/sycl_nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl_nightly.yml index 3bfec01ff60dc..058f1ab33f4fc 100644 --- a/.github/workflows/sycl_nightly.yml +++ b/.github/workflows/sycl_nightly.yml @@ -24,7 +24,7 @@ jobs: with: build_cache_root: "/__w/" build_artifact_suffix: default-2204 - build_configure_extra_args: '' + build_configure_extra_args: '--hip --cuda --enable-esimd-emulator' ubuntu2204_opaque_pointers_build_test: if: github.repository == 'intel/llvm' From 9ad2588e0d7cb8a4916158cbc0b23acd62aebb63 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Tue, 13 Jun 2023 16:21:34 -0400 Subject: [PATCH 13/55] [SYCL][ESIMD][E2E] Revert "Fix scale tests on hardware that doesn't support fp64 (#9552)" (#9826) After 2910add75c712a5fab3e2fecc6bfc61fd4884417, the splitting does the right thing with `invoke_simd`, and we can use this test to lock down the functionality which previously didn't work. This reverts commit 8e19a94400c79a3d9ecb1870473828da82c54831. --- .../Feature/ImplicitSubgroup/scale_double.cpp | 13 --------- sycl/test-e2e/InvokeSimd/Feature/scale.cpp | 4 +-- .../InvokeSimd/Feature/scale_double.cpp | 27 ------------------- 3 files changed, 1 insertion(+), 43 deletions(-) delete mode 100644 sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/scale_double.cpp delete mode 100644 sycl/test-e2e/InvokeSimd/Feature/scale_double.cpp diff --git a/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/scale_double.cpp b/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/scale_double.cpp deleted file mode 100644 index 82a06125214e0..0000000000000 --- a/sycl/test-e2e/InvokeSimd/Feature/ImplicitSubgroup/scale_double.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// REQUIRES: aspect-fp64 -// Check that full compilation works: -// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../scale_double.cpp -o %t.out -// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out -// -// VISALTO enable run -// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out - -/* - * This tests is the same as InvokeSimd/feature/scale_double.cpp, but compiles - * without optional subgroup attribute specified and intended to check that - * compiler is able to choose subgroup size correctly. 
- */ diff --git a/sycl/test-e2e/InvokeSimd/Feature/scale.cpp b/sycl/test-e2e/InvokeSimd/Feature/scale.cpp index 9b7850f51589d..812074e81f716 100644 --- a/sycl/test-e2e/InvokeSimd/Feature/scale.cpp +++ b/sycl/test-e2e/InvokeSimd/Feature/scale.cpp @@ -147,7 +147,6 @@ int main(void) { const bool SupportsDouble = dev.has(aspect::fp64); bool passed = true; -#ifndef TEST_DOUBLE_TYPE passed &= test(q); passed &= test(q); passed &= test(q); @@ -158,10 +157,9 @@ int main(void) { passed &= test(q); passed &= test(q); -#else if (SupportsDouble) passed &= test(q); -#endif + std::cout << (passed ? "Passed\n" : "FAILED\n"); return passed ? 0 : 1; } diff --git a/sycl/test-e2e/InvokeSimd/Feature/scale_double.cpp b/sycl/test-e2e/InvokeSimd/Feature/scale_double.cpp deleted file mode 100644 index 4b722b0239181..0000000000000 --- a/sycl/test-e2e/InvokeSimd/Feature/scale_double.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// REQUIRES: aspect-fp64 -// Check that full compilation works: -// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out -// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out -// -// VISALTO enable run -// RUN: env IGC_VISALTO=63 IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out - -/* - * Tests invoke_simd support in the compiler/headers - * Test case purpose: - * ----------------- - * To verify that the simple scale example from the invoke_simd spec - * https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_invoke_simd.asciidoc - * works. - * - * Test case description: - * --------------------- - * Invoke a simple SIMD function that scales all elements of a SIMD type X by a - * scalar value n with double. - * - * This test also runs with all types of VISA link time optimizations enabled. - */ - -#define TEST_DOUBLE_TYPE - -#include "scale.cpp" From f4525e901cd6ab4b8cf114350698d93b75ff5f17 Mon Sep 17 00:00:00 2001 From: Kseniya Tikhomirova Date: Wed, 14 Jun 2023 00:25:56 +0200 Subject: [PATCH 14/55] [SYCL] Remove _CODELOC* macro from API (#9847) Signed-off-by: Tikhomirova, Kseniya --- sycl/doc/PreprocessorMacros.md | 5 - sycl/include/sycl/detail/common.hpp | 29 +- sycl/include/sycl/queue.hpp | 682 ++++++++++++++++------------ sycl/include/sycl/usm.hpp | 267 ++++++----- sycl/source/detail/usm/usm_impl.cpp | 168 +++---- sycl/source/queue.cpp | 48 +- 6 files changed, 623 insertions(+), 576 deletions(-) diff --git a/sycl/doc/PreprocessorMacros.md b/sycl/doc/PreprocessorMacros.md index a4154b217ffd0..e03ee9a94e31c 100644 --- a/sycl/doc/PreprocessorMacros.md +++ b/sycl/doc/PreprocessorMacros.md @@ -19,11 +19,6 @@ This file describes macros that have effect on SYCL compiler and run-time. the underlying pointer types return pointers without any additional qualifiers so it's disabled by default. -- **DISABLE_SYCL_INSTRUMENTATION_METADATA** - - This macro is used to disable passing of code location information to public - methods. - - **SYCL2020_DISABLE_DEPRECATION_WARNINGS** Disables warnings coming from usage of SYCL 1.2.1 APIs, that are deprecated in diff --git a/sycl/include/sycl/detail/common.hpp b/sycl/include/sycl/detail/common.hpp index 4d0b1aea245f1..41bd33d89cfc7 100644 --- a/sycl/include/sycl/detail/common.hpp +++ b/sycl/include/sycl/detail/common.hpp @@ -18,9 +18,7 @@ #include // Default signature enables the passing of user code location information to -// public methods as a default argument. 
If the end-user wants to disable the -// code location information, they must compile the code with -// -DDISABLE_SYCL_INSTRUMENTATION_METADATA flag +// public methods as a default argument. namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { @@ -96,31 +94,6 @@ struct code_location { unsigned long MColumnNo; }; -// The C++ FE may instrument user calls with code location metadata. -// If it does then that will appear as an extra last argument. -// Having _TWO_ mid-param #ifdefs makes the functions very difficult to read. -// Here we simplify the &CodeLoc declaration to be _CODELOCPARAM(&CodeLoc) and -// _CODELOCARG(&CodeLoc). - -#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA -#define _CODELOCONLYPARAM(a) \ - const ::sycl::detail::code_location a = \ - ::sycl::detail::code_location::current() -#define _CODELOCPARAM(a) \ - , const ::sycl::detail::code_location a = \ - ::sycl::detail::code_location::current() -#define _CODELOCPARAMDEF(a) , const ::sycl::detail::code_location a - -#define _CODELOCARG(a) -#define _CODELOCFW(a) , a -#else -#define _CODELOCONLYPARAM(a) -#define _CODELOCPARAM(a) - -#define _CODELOCARG(a) const ::sycl::detail::code_location a = {} -#define _CODELOCFW(a) -#endif - /// @brief Data type that manages the code_location information in TLS /// @details As new SYCL features are added, they all enable the propagation of /// the code location information where the SYCL API was called by the diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 964d7a530d46f..a5edd309ba67b 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -305,8 +305,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param CGF is a function object containing command group. /// \param CodeLoc is the code location of the submit call (default argument) /// \return a SYCL event object for the submitted command group. - template event submit(T CGF _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + template + event submit(T CGF, const detail::code_location &CodeLoc = + detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #if __SYCL_USE_FALLBACK_ASSERT auto PostProcess = [this, &CodeLoc](bool IsKernel, bool KernelUsesAssert, @@ -341,8 +342,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. template - event submit(T CGF, queue &SecondaryQueue _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event submit( + T CGF, queue &SecondaryQueue, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); #if __SYCL_USE_FALLBACK_ASSERT auto PostProcess = [this, &SecondaryQueue, &CodeLoc]( @@ -375,9 +377,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param CodeLoc is the code location of the submit call (default argument) /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. 
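// --------------------------------------------------------------------------
// Editor's note (illustrative aside, not part of this patch): the commit
// above replaces the _CODELOCPARAM/_CODELOCARG macro machinery with a plain
// defaulted parameter of type sycl::detail::code_location, so every public
// entry point always receives the caller's source location.  The
// self-contained sketch below shows the underlying pattern using C++20
// std::source_location as a stand-in for detail::code_location; `submit`
// here is a hypothetical free function, not the real sycl::queue::submit.
#include <iostream>
#include <source_location>

// Because a default argument is evaluated at the call site, the callee sees
// the caller's file/line/function without any macro or extra typing by the
// user.  (In the real headers the captured value is additionally stashed in
// a TLS object, detail::tls_code_loc_t, so nested calls can reuse it.)
void submit(int work, const std::source_location &loc =
                          std::source_location::current()) {
  std::cout << "work " << work << " submitted from " << loc.file_name() << ':'
            << loc.line() << " in " << loc.function_name() << '\n';
}

int main() {
  submit(42); // reports this file and line; no -DDISABLE_* switch involved
  return 0;
}
// --------------------------------------------------------------------------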
- event ext_oneapi_submit_barrier(_CODELOCONLYPARAM(&CodeLoc)) { - return submit( - [=](handler &CGH) { CGH.ext_oneapi_barrier(); } _CODELOCFW(CodeLoc)); + event ext_oneapi_submit_barrier( + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(); }, CodeLoc); } /// Prevents any commands submitted afterward to this queue from executing @@ -388,8 +390,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. __SYCL2020_DEPRECATED("use 'ext_oneapi_submit_barrier' instead") - event submit_barrier(_CODELOCONLYPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event submit_barrier( + const detail::code_location &CodeLoc = detail::code_location::current()) { return ext_oneapi_submit_barrier(CodeLoc); } @@ -403,10 +405,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. event ext_oneapi_submit_barrier( - const std::vector &WaitList _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.ext_oneapi_barrier(WaitList); - } _CODELOCFW(CodeLoc)); + const std::vector &WaitList, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(WaitList); }, + CodeLoc); } /// Prevents any commands submitted afterward to this queue from executing @@ -419,9 +421,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. __SYCL2020_DEPRECATED("use 'ext_oneapi_submit_barrier' instead") - event - submit_barrier(const std::vector &WaitList _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event submit_barrier( + const std::vector &WaitList, + const detail::code_location &CodeLoc = detail::code_location::current()) { return ext_oneapi_submit_barrier(WaitList, CodeLoc); } @@ -430,8 +432,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// /// Synchronous errors will be reported through SYCL exceptions. /// @param CodeLoc is the code location of the submit call (default argument) - void wait(_CODELOCONLYPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + void wait( + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); wait_proxy(CodeLoc); } @@ -444,8 +446,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// construction. If no async_handler was provided then asynchronous /// exceptions will be lost. /// @param CodeLoc is the code location of the submit call (default argument) - void wait_and_throw(_CODELOCONLYPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + void wait_and_throw( + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); wait_and_throw_proxy(CodeLoc); } @@ -481,14 +483,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Count is the number of times to fill Pattern into Ptr. /// \return an event representing fill operation. 
template - event fill(void *Ptr, const T &Pattern, - size_t Count _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event fill( + void *Ptr, const T &Pattern, size_t Count, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - - return submit([&](handler &CGH) { - CGH.fill(Ptr, Pattern, Count); - } _CODELOCFW(CodeLoc)); + return submit([&](handler &CGH) { CGH.fill(Ptr, Pattern, Count); }, + CodeLoc); } /// Fills the specified memory with the specified pattern. @@ -500,14 +500,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing fill operation. template - event fill(void *Ptr, const T &Pattern, size_t Count, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event fill( + void *Ptr, const T &Pattern, size_t Count, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit([&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.fill(Ptr, Pattern, Count); - } _CODELOCFW(CodeLoc)); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.fill(Ptr, Pattern, Count); + }, + CodeLoc); } /// Fills the specified memory with the specified pattern. @@ -520,14 +522,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependencies. /// \return an event representing fill operation. template - event fill(void *Ptr, const T &Pattern, size_t Count, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event fill( + void *Ptr, const T &Pattern, size_t Count, + const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit([&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.fill(Ptr, Pattern, Count); - } _CODELOCFW(CodeLoc)); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.fill(Ptr, Pattern, Count); + }, + CodeLoc); } /// Fills the memory pointed by a USM pointer with the value specified. @@ -539,7 +544,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Value is a value to be set. Value is cast as an unsigned char. /// \param Count is a number of bytes to fill. /// \return an event representing fill operation. - event memset(void *Ptr, int Value, size_t Count _CODELOCPARAM(&CodeLoc)); + event memset( + void *Ptr, int Value, size_t Count, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Fills the memory pointed by a USM pointer with the value specified. /// No operations is done if \param Count is zero. An exception is thrown @@ -551,8 +558,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Count is a number of bytes to fill. /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing fill operation. - event memset(void *Ptr, int Value, size_t Count, - event DepEvent _CODELOCPARAM(&CodeLoc)); + event memset( + void *Ptr, int Value, size_t Count, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Fills the memory pointed by a USM pointer with the value specified. /// No operations is done if \param Count is zero. 
An exception is thrown @@ -565,8 +573,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. /// \return an event representing fill operation. - event memset(void *Ptr, int Value, size_t Count, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)); + event memset( + void *Ptr, int Value, size_t Count, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Copies data from one memory region to another, each is either a host /// pointer or a pointer within USM allocation accessible on the device @@ -579,8 +588,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Src is a USM pointer to the source memory. /// \param Count is a number of bytes to copy. /// \return an event representing copy operation. - event memcpy(void *Dest, const void *Src, - size_t Count _CODELOCPARAM(&CodeLoc)); + event memcpy( + void *Dest, const void *Src, size_t Count, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Copies data from one memory region to another, each is either a host /// pointer or a pointer within USM allocation accessible on the device @@ -594,8 +604,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Count is a number of bytes to copy. /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing copy operation. - event memcpy(void *Dest, const void *Src, size_t Count, - event DepEvent _CODELOCPARAM(&CodeLoc)); + event memcpy( + void *Dest, const void *Src, size_t Count, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Copies data from one memory region to another, each is either a host /// pointer or a pointer within USM allocation accessible on the device @@ -610,8 +621,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. /// \return an event representing copy operation. - event memcpy(void *Dest, const void *Src, size_t Count, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)); + event memcpy( + void *Dest, const void *Src, size_t Count, + const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Copies data from one memory region to another, each is either a host /// pointer or a pointer within USM allocation accessible on the device @@ -626,8 +639,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param CodeLoc contains the code location of user code /// \return an event representing copy operation. template - event copy(const T *Src, T *Dest, size_t Count _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event copy( + const T *Src, T *Dest, size_t Count, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, Count * sizeof(T)); } @@ -646,9 +660,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param CodeLoc contains the code location of user code /// \return an event representing copy operation. 
template - event copy(const T *Src, T *Dest, size_t Count, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event copy( + const T *Src, T *Dest, size_t Count, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, Count * sizeof(T), DepEvent); } @@ -667,9 +681,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param CodeLoc contains the code location of user code /// \return an event representing copy operation. template - event copy(const T *Src, T *Dest, size_t Count, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event copy( + const T *Src, T *Dest, size_t Count, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, Count * sizeof(T), DepEvents); } @@ -682,8 +696,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Advice is a device-defined advice for the specified allocation. /// \return an event representing advice operation. __SYCL2020_DEPRECATED("use the overload with int Advice instead") - event mem_advise(const void *Ptr, size_t Length, - pi_mem_advice Advice _CODELOCPARAM(&CodeLoc)); + event mem_advise( + const void *Ptr, size_t Length, pi_mem_advice Advice, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Provides additional information to the underlying runtime about how /// different allocations are used. @@ -692,8 +707,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Length is a number of bytes in the allocation. /// \param Advice is a device-defined advice for the specified allocation. /// \return an event representing advice operation. - event mem_advise(const void *Ptr, size_t Length, - int Advice _CODELOCPARAM(&CodeLoc)); + event mem_advise( + const void *Ptr, size_t Length, int Advice, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Provides additional information to the underlying runtime about how /// different allocations are used. @@ -703,8 +719,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Advice is a device-defined advice for the specified allocation. /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing advice operation. - event mem_advise(const void *Ptr, size_t Length, int Advice, - event DepEvent _CODELOCPARAM(&CodeLoc)); + event mem_advise( + const void *Ptr, size_t Length, int Advice, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Provides additional information to the underlying runtime about how /// different allocations are used. @@ -715,8 +732,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. /// \return an event representing advice operation. 
- event mem_advise(const void *Ptr, size_t Length, int Advice, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)); + event mem_advise( + const void *Ptr, size_t Length, int Advice, + const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()); /// Provides hints to the runtime library that data should be made available /// on a device earlier than Unified Shared Memory would normally require it @@ -725,11 +744,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Ptr is a USM pointer to the memory to be prefetched to the device. /// \param Count is a number of bytes to be prefetched. /// \return an event representing prefetch operation. - event prefetch(const void *Ptr, size_t Count _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event prefetch( + const void *Ptr, size_t Count, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [=](handler &CGH) { CGH.prefetch(Ptr, Count); } _CODELOCFW(CodeLoc)); + return submit([=](handler &CGH) { CGH.prefetch(Ptr, Count); }, CodeLoc); } /// Provides hints to the runtime library that data should be made available @@ -740,14 +759,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Count is a number of bytes to be prefetched. /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing prefetch operation. - event prefetch(const void *Ptr, size_t Count, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event prefetch( + const void *Ptr, size_t Count, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit([=](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.prefetch(Ptr, Count); - } _CODELOCFW(CodeLoc)); + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.prefetch(Ptr, Count); + }, + CodeLoc); } /// Provides hints to the runtime library that data should be made available @@ -759,14 +780,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param DepEvents is a vector of events that specifies the kernel /// dependencies. /// \return an event representing prefetch operation. - event prefetch(const void *Ptr, size_t Count, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event prefetch( + const void *Ptr, size_t Count, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit([=](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.prefetch(Ptr, Count); - } _CODELOCFW(CodeLoc)); + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.prefetch(Ptr, Count); + }, + CodeLoc); } /// Copies data from one 2D memory region to another, both pointed by @@ -789,12 +812,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing the copy operation. 
template >> - event ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src, - size_t SrcPitch, size_t Width, - size_t Height _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_memcpy2d( + void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, + size_t Width, size_t Height, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, + Height); + }, + CodeLoc); } /// Copies data from one 2D memory region to another, both pointed by @@ -818,13 +845,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing the copy operation. template >> - event ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src, - size_t SrcPitch, size_t Width, size_t Height, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_memcpy2d( + void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, + size_t Width, size_t Height, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, + Height); + }, + CodeLoc); } /// Copies data from one 2D memory region to another, both pointed by @@ -849,14 +880,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing the copy operation. template >> - event ext_oneapi_memcpy2d(void *Dest, size_t DestPitch, const void *Src, - size_t SrcPitch, size_t Width, size_t Height, - const std::vector &DepEvents - _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_memcpy2d( + void *Dest, size_t DestPitch, const void *Src, size_t SrcPitch, + size_t Width, size_t Height, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.ext_oneapi_memcpy2d(Dest, DestPitch, Src, SrcPitch, Width, + Height); + }, + CodeLoc); } /// Copies data from one 2D memory region to another, both pointed by @@ -875,12 +909,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Height is the height in number of rows of the 2D region to copy. /// \return an event representing the copy operation. 
template - event ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest, - size_t DestPitch, size_t Width, - size_t Height _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.ext_oneapi_copy2d(Src, SrcPitch, Dest, DestPitch, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_copy2d( + const T *Src, size_t SrcPitch, T *Dest, size_t DestPitch, size_t Width, + size_t Height, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.ext_oneapi_copy2d(Src, SrcPitch, Dest, DestPitch, Width, + Height); + }, + CodeLoc); } /// Copies data from one 2D memory region to another, both pointed by @@ -900,13 +938,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing the copy operation. template - event ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest, - size_t DestPitch, size_t Width, size_t Height, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.ext_oneapi_copy2d(Src, SrcPitch, Dest, DestPitch, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_copy2d( + const T *Src, size_t SrcPitch, T *Dest, size_t DestPitch, size_t Width, + size_t Height, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.ext_oneapi_copy2d(Src, SrcPitch, Dest, DestPitch, Width, + Height); + }, + CodeLoc); } /// Copies data from one 2D memory region to another, both pointed by @@ -927,14 +969,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependencies. /// \return an event representing the copy operation. template - event ext_oneapi_copy2d(const T *Src, size_t SrcPitch, T *Dest, - size_t DestPitch, size_t Width, size_t Height, - const std::vector &DepEvents - _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.ext_oneapi_copy2d(Src, SrcPitch, Dest, DestPitch, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_copy2d( + const T *Src, size_t SrcPitch, T *Dest, size_t DestPitch, size_t Width, + size_t Height, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.ext_oneapi_copy2d(Src, SrcPitch, Dest, DestPitch, Width, + Height); + }, + CodeLoc); } /// Fills the memory pointed by a USM pointer with the value specified. @@ -955,12 +1000,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing the fill operation. template >> - event ext_oneapi_memset2d(void *Dest, size_t DestPitch, int Value, - size_t Width, - size_t Height _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.ext_oneapi_memset2d(Dest, DestPitch, Value, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_memset2d( + void *Dest, size_t DestPitch, int Value, size_t Width, size_t Height, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.ext_oneapi_memset2d(Dest, DestPitch, Value, Width, Height); + }, + CodeLoc); } /// Fills the memory pointed by a USM pointer with the value specified. @@ -982,13 +1029,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing the fill operation. 
template >> - event ext_oneapi_memset2d(void *Dest, size_t DestPitch, int Value, - size_t Width, size_t Height, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.ext_oneapi_memset2d(Dest, DestPitch, Value, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_memset2d( + void *Dest, size_t DestPitch, int Value, size_t Width, size_t Height, + event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.ext_oneapi_memset2d(Dest, DestPitch, Value, Width, Height); + }, + CodeLoc); } /// Fills the memory pointed by a USM pointer with the value specified. @@ -1013,11 +1063,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename = std::enable_if_t>> event ext_oneapi_memset2d( void *Dest, size_t DestPitch, int Value, size_t Width, size_t Height, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.ext_oneapi_memset2d(Dest, DestPitch, Value, Width, Height); - } _CODELOCFW(CodeLoc)); + const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.ext_oneapi_memset2d(Dest, DestPitch, Value, Width, Height); + }, + CodeLoc); } /// Fills the memory pointed by a USM pointer with the value specified. @@ -1034,11 +1087,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Height is the height in number of rows of the 2D region to fill. /// \return an event representing the fill operation. template - event ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern, - size_t Width, size_t Height _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.ext_oneapi_fill2d(Dest, DestPitch, Pattern, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_fill2d( + void *Dest, size_t DestPitch, const T &Pattern, size_t Width, + size_t Height, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.ext_oneapi_fill2d(Dest, DestPitch, Pattern, Width, Height); + }, + CodeLoc); } /// Fills the memory pointed by a USM pointer with the value specified. @@ -1056,13 +1113,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param DepEvent is an event that specifies the kernel dependencies. /// \return an event representing the fill operation. template - event ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern, - size_t Width, size_t Height, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.ext_oneapi_fill2d(Dest, DestPitch, Pattern, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_fill2d( + void *Dest, size_t DestPitch, const T &Pattern, size_t Width, + size_t Height, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.ext_oneapi_fill2d(Dest, DestPitch, Pattern, Width, Height); + }, + CodeLoc); } /// Fills the memory pointed by a USM pointer with the value specified. @@ -1081,14 +1141,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependencies. /// \return an event representing the fill operation. 
template - event ext_oneapi_fill2d(void *Dest, size_t DestPitch, const T &Pattern, - size_t Width, size_t Height, - const std::vector &DepEvents - _CODELOCPARAM(&CodeLoc)) { - return submit([=](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.ext_oneapi_fill2d(Dest, DestPitch, Pattern, Width, Height); - } _CODELOCFW(CodeLoc)); + event ext_oneapi_fill2d( + void *Dest, size_t DestPitch, const T &Pattern, size_t Width, + size_t Height, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [=](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.ext_oneapi_fill2d(Dest, DestPitch, Pattern, Width, Height); + }, + CodeLoc); } /// Copies data from a USM memory region to a device_global. @@ -1104,10 +1166,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependencies. /// \return an event representing copy operation. template - event memcpy(ext::oneapi::experimental::device_global &Dest, - const void *Src, size_t NumBytes, size_t Offset, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event memcpy( + ext::oneapi::experimental::device_global &Dest, + const void *Src, size_t NumBytes, size_t Offset, + const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); if (sizeof(T) < Offset + NumBytes) throw sycl::exception(make_error_code(errc::invalid), @@ -1116,10 +1179,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if (!detail::isDeviceGlobalUsedInKernel(&Dest)) { // device_global is unregistered so we need a fallback. We let the handler // implement this fallback. - return submit([&](handler &CGH) { - CGH.depends_on(DepEvents); - return CGH.memcpy(Dest, Src, NumBytes, Offset); - } _CODELOCFW(CodeLoc)); + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + return CGH.memcpy(Dest, Src, NumBytes, Offset); + }, + CodeLoc); } constexpr bool IsDeviceImageScoped = PropertyListT::template has_property< @@ -1141,10 +1206,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependency. /// \return an event representing copy operation. template - event memcpy(ext::oneapi::experimental::device_global &Dest, - const void *Src, size_t NumBytes, size_t Offset, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event memcpy( + ext::oneapi::experimental::device_global &Dest, + const void *Src, size_t NumBytes, size_t Offset, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, NumBytes, Offset, std::vector{DepEvent}); @@ -1161,10 +1226,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Offset is the offset into \param Dest to copy to. /// \return an event representing copy operation. 
template - event memcpy(ext::oneapi::experimental::device_global &Dest, - const void *Src, size_t NumBytes = sizeof(T), - size_t Offset = 0 _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event memcpy( + ext::oneapi::experimental::device_global &Dest, + const void *Src, size_t NumBytes = sizeof(T), size_t Offset = 0, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, NumBytes, Offset, std::vector{}); } @@ -1182,12 +1247,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependencies. /// \return an event representing copy operation. template - event - memcpy(void *Dest, - const ext::oneapi::experimental::device_global &Src, - size_t NumBytes, size_t Offset, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event memcpy( + void *Dest, + const ext::oneapi::experimental::device_global &Src, + size_t NumBytes, size_t Offset, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); if (sizeof(T) < Offset + NumBytes) throw sycl::exception(make_error_code(errc::invalid), @@ -1221,12 +1285,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependency. /// \return an event representing copy operation. template - event - memcpy(void *Dest, - const ext::oneapi::experimental::device_global &Src, - size_t NumBytes, size_t Offset, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event memcpy( + void *Dest, + const ext::oneapi::experimental::device_global &Src, + size_t NumBytes, size_t Offset, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, NumBytes, Offset, std::vector{DepEvent}); @@ -1243,12 +1306,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param Offset is the offset into \param Src to copy from. /// \return an event representing copy operation. template - event - memcpy(void *Dest, - const ext::oneapi::experimental::device_global &Src, - size_t NumBytes = sizeof(T), - size_t Offset = 0 _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event memcpy( + void *Dest, + const ext::oneapi::experimental::device_global &Src, + size_t NumBytes = sizeof(T), size_t Offset = 0, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, NumBytes, Offset, std::vector{}); } @@ -1267,11 +1329,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependencies. /// \return an event representing copy operation. template - event copy(const std::remove_all_extents_t *Src, - ext::oneapi::experimental::device_global &Dest, - size_t Count, size_t StartIndex, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event copy( + const std::remove_all_extents_t *Src, + ext::oneapi::experimental::device_global &Dest, + size_t Count, size_t StartIndex, const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t), StartIndex * sizeof(std::remove_all_extents_t), @@ -1292,11 +1354,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependency. 
/// \return an event representing copy operation. template - event copy(const std::remove_all_extents_t *Src, - ext::oneapi::experimental::device_global &Dest, - size_t Count, size_t StartIndex, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event copy( + const std::remove_all_extents_t *Src, + ext::oneapi::experimental::device_global &Dest, + size_t Count, size_t StartIndex, event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t), StartIndex * sizeof(std::remove_all_extents_t), @@ -1315,11 +1377,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param StartIndex is the index of the first element in Dest to copy to. /// \return an event representing copy operation. template - event copy(const std::remove_all_extents_t *Src, - ext::oneapi::experimental::device_global &Dest, - size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t), - size_t StartIndex = 0 _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event copy( + const std::remove_all_extents_t *Src, + ext::oneapi::experimental::device_global &Dest, + size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t), + size_t StartIndex = 0, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t), StartIndex * sizeof(std::remove_all_extents_t)); @@ -1339,11 +1402,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependencies. /// \return an event representing copy operation. template - event - copy(const ext::oneapi::experimental::device_global &Src, - std::remove_all_extents_t *Dest, size_t Count, size_t StartIndex, - const std::vector &DepEvents _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event copy( + const ext::oneapi::experimental::device_global &Src, + std::remove_all_extents_t *Dest, size_t Count, size_t StartIndex, + const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t), StartIndex * sizeof(std::remove_all_extents_t), @@ -1364,11 +1427,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// dependency. /// \return an event representing copy operation. template - event - copy(const ext::oneapi::experimental::device_global &Src, - std::remove_all_extents_t *Dest, size_t Count, size_t StartIndex, - event DepEvent _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event copy( + const ext::oneapi::experimental::device_global &Src, + std::remove_all_extents_t *Dest, size_t Count, size_t StartIndex, + event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t), StartIndex * sizeof(std::remove_all_extents_t), @@ -1387,12 +1450,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param StartIndex is the index of the first element in Src to copy from. /// \return an event representing copy operation. 
template - event - copy(const ext::oneapi::experimental::device_global &Src, - std::remove_all_extents_t *Dest, - size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t), - size_t StartIndex = 0 _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event copy( + const ext::oneapi::experimental::device_global &Src, + std::remove_all_extents_t *Dest, + size_t Count = sizeof(T) / sizeof(std::remove_all_extents_t), + size_t StartIndex = 0, + const detail::code_location &CodeLoc = detail::code_location::current()) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t), StartIndex * sizeof(std::remove_all_extents_t)); @@ -1407,8 +1470,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename PropertiesT> std::enable_if_t< ext::oneapi::experimental::is_property_list::value, event> - single_task(PropertiesT Properties, - _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { + single_task( + PropertiesT Properties, _KERNELFUNCPARAM(KernelFunc), + const detail::code_location &CodeLoc = detail::code_location::current()) { static_assert( (detail::check_fn_signature, void()>::value || @@ -1416,7 +1480,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { void(kernel_handler)>::value), "sycl::queue.single_task() requires a kernel instead of command group. " "Use queue.submit() instead"); - _CODELOCARG(&CodeLoc); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( [&](handler &CGH) { @@ -1431,10 +1495,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param KernelFunc is the Kernel functor or lambda /// \param CodeLoc contains the code location of user code template - event single_task(_KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { + event single_task( + _KERNELFUNCPARAM(KernelFunc), + const detail::code_location &CodeLoc = detail::code_location::current()) { return single_task( - ext::oneapi::experimental::detail::empty_properties_t{}, - KernelFunc _CODELOCFW(CodeLoc)); + ext::oneapi::experimental::detail::empty_properties_t{}, KernelFunc, + CodeLoc); } /// single_task version with a kernel represented as a lambda. @@ -1447,8 +1513,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename PropertiesT> std::enable_if_t< ext::oneapi::experimental::is_property_list::value, event> - single_task(event DepEvent, PropertiesT Properties, - _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { + single_task( + event DepEvent, PropertiesT Properties, _KERNELFUNCPARAM(KernelFunc), + const detail::code_location &CodeLoc = detail::code_location::current()) { static_assert( (detail::check_fn_signature, void()>::value || @@ -1456,7 +1523,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { void(kernel_handler)>::value), "sycl::queue.single_task() requires a kernel instead of command group. 
" "Use queue.submit() instead"); - _CODELOCARG(&CodeLoc); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( [&](handler &CGH) { @@ -1473,11 +1540,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param KernelFunc is the Kernel functor or lambda /// \param CodeLoc contains the code location of user code template - event single_task(event DepEvent, - _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { + event single_task( + event DepEvent, _KERNELFUNCPARAM(KernelFunc), + const detail::code_location &CodeLoc = detail::code_location::current()) { return single_task( DepEvent, ext::oneapi::experimental::detail::empty_properties_t{}, - KernelFunc _CODELOCFW(CodeLoc)); + KernelFunc, CodeLoc); } /// single_task version with a kernel represented as a lambda. @@ -1491,8 +1559,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { typename PropertiesT> std::enable_if_t< ext::oneapi::experimental::is_property_list::value, event> - single_task(const std::vector &DepEvents, PropertiesT Properties, - _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { + single_task( + const std::vector &DepEvents, PropertiesT Properties, + _KERNELFUNCPARAM(KernelFunc), + const detail::code_location &CodeLoc = detail::code_location::current()) { static_assert( (detail::check_fn_signature, void()>::value || @@ -1500,7 +1570,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { void(kernel_handler)>::value), "sycl::queue.single_task() requires a kernel instead of command group. " "Use queue.submit() instead"); - _CODELOCARG(&CodeLoc); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return submit( [&](handler &CGH) { @@ -1518,11 +1588,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \param KernelFunc is the Kernel functor or lambda /// \param CodeLoc contains the code location of user code template - event single_task(const std::vector &DepEvents, - _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { + event single_task( + const std::vector &DepEvents, _KERNELFUNCPARAM(KernelFunc), + const detail::code_location &CodeLoc = detail::code_location::current()) { return single_task( DepEvents, ext::oneapi::experimental::detail::empty_properties_t{}, - KernelFunc _CODELOCFW(CodeLoc)); + KernelFunc, CodeLoc); } /// parallel_for version with a kernel represented as a lambda + range that @@ -1825,12 +1896,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing copy operation. template - event copy(accessor Src, - std::shared_ptr Dest _CODELOCPARAM(&CodeLoc)) { - return submit([&](handler &CGH) { - CGH.require(Src); - CGH.copy(Src, Dest); - } _CODELOCFW(CodeLoc)); + event copy( + accessor Src, + std::shared_ptr Dest, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [&](handler &CGH) { + CGH.require(Src); + CGH.copy(Src, Dest); + }, + CodeLoc); } /// Copies data from a memory region pointed to by a shared_ptr to another @@ -1841,13 +1916,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing copy operation. 
template - event copy(std::shared_ptr Src, - accessor Dest - _CODELOCPARAM(&CodeLoc)) { - return submit([&](handler &CGH) { - CGH.require(Dest); - CGH.copy(Src, Dest); - } _CODELOCFW(CodeLoc)); + event copy( + std::shared_ptr Src, + accessor Dest, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [&](handler &CGH) { + CGH.require(Dest); + CGH.copy(Src, Dest); + }, + CodeLoc); } /// Copies data from a memory region pointed to by a placeholder accessor to @@ -1858,12 +1936,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing copy operation. template - event copy(accessor Src, - DestT *Dest _CODELOCPARAM(&CodeLoc)) { - return submit([&](handler &CGH) { - CGH.require(Src); - CGH.copy(Src, Dest); - } _CODELOCFW(CodeLoc)); + event copy( + accessor Src, DestT *Dest, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [&](handler &CGH) { + CGH.require(Src); + CGH.copy(Src, Dest); + }, + CodeLoc); } /// Copies data from a memory region pointed to by a raw pointer to another @@ -1874,13 +1955,16 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing copy operation. template - event copy(const SrcT *Src, - accessor Dest - _CODELOCPARAM(&CodeLoc)) { - return submit([&](handler &CGH) { - CGH.require(Dest); - CGH.copy(Src, Dest); - } _CODELOCFW(CodeLoc)); + event copy( + const SrcT *Src, + accessor Dest, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [&](handler &CGH) { + CGH.require(Dest); + CGH.copy(Src, Dest); + }, + CodeLoc); } /// Copies data from one memory region to another, both pointed by placeholder @@ -1893,15 +1977,17 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { access::placeholder IsSrcPlaceholder, typename DestT, int DestDims, access_mode DestMode, target DestTgt, access::placeholder IsDestPlaceholder> - event - copy(accessor Src, - accessor Dest - _CODELOCPARAM(&CodeLoc)) { - return submit([&](handler &CGH) { - CGH.require(Src); - CGH.require(Dest); - CGH.copy(Src, Dest); - } _CODELOCFW(CodeLoc)); + event copy( + accessor Src, + accessor Dest, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [&](handler &CGH) { + CGH.require(Src); + CGH.require(Dest); + CGH.copy(Src, Dest); + }, + CodeLoc); } /// Provides guarantees that the memory object accessed via Acc is updated @@ -1912,11 +1998,14 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { template event update_host( - accessor Acc _CODELOCPARAM(&CodeLoc)) { - return submit([&](handler &CGH) { - CGH.require(Acc); - CGH.update_host(Acc); - } _CODELOCFW(CodeLoc)); + accessor Acc, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [&](handler &CGH) { + CGH.require(Acc); + CGH.update_host(Acc); + }, + CodeLoc); } /// Fills the specified memory with the specified data. @@ -1927,12 +2016,15 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// \return an event representing fill operation. 
template - event fill(accessor Dest, - const T &Src _CODELOCPARAM(&CodeLoc)) { - return submit([&](handler &CGH) { - CGH.require(Dest); - CGH.fill(Dest, Src); - } _CODELOCFW(CodeLoc)); + event fill( + accessor Dest, const T &Src, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [&](handler &CGH) { + CGH.require(Dest); + CGH.fill(Dest, Src); + }, + CodeLoc); } /// @brief Returns true if the queue was created with the diff --git a/sycl/include/sycl/usm.hpp b/sycl/include/sycl/usm.hpp index 048f007cc61d4..87d2b10fe91c4 100644 --- a/sycl/include/sycl/usm.hpp +++ b/sycl/include/sycl/usm.hpp @@ -20,111 +20,135 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { /// // Explicit USM /// -__SYCL_EXPORT void *malloc_device(size_t size, const device &dev, - const context &ctxt _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -malloc_device(size_t size, const device &dev, const context &ctxt, - const property_list &propList _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void *malloc_device(size_t size, - const queue &q _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -malloc_device(size_t size, const queue &q, - const property_list &propList _CODELOCPARAM(&CodeLoc)); +__SYCL_EXPORT void *malloc_device( + size_t size, const device &dev, const context &ctxt, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *malloc_device( + size_t size, const device &dev, const context &ctxt, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *malloc_device( + size_t size, const queue &q, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *malloc_device( + size_t size, const queue &q, const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); -__SYCL_EXPORT void * -aligned_alloc_device(size_t alignment, size_t size, const device &dev, - const context &ctxt _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -aligned_alloc_device(size_t alignment, size_t size, const device &dev, - const context &ctxt, - const property_list &propList _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -aligned_alloc_device(size_t alignment, size_t size, - const queue &q _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -aligned_alloc_device(size_t alignment, size_t size, const queue &q, - const property_list &propList _CODELOCPARAM(&CodeLoc)); +__SYCL_EXPORT void *aligned_alloc_device( + size_t alignment, size_t size, const device &dev, const context &ctxt, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc_device( + size_t alignment, size_t size, const device &dev, const context &ctxt, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc_device( + size_t alignment, size_t size, const queue &q, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc_device( + size_t alignment, size_t size, const queue &q, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); -__SYCL_EXPORT void free(void *ptr, const context &ctxt _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void free(void *ptr, const queue &q _CODELOCPARAM(&CodeLoc)); +__SYCL_EXPORT void +free(void *ptr, const context &ctxt, + const detail::code_location &CodeLoc = detail::code_location::current()); 
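Every hunk in this patch applies the same transformation: the _CODELOCPARAM/_CODELOCARG/_CODELOCFW macros are replaced by an explicit trailing parameter defaulted to detail::code_location::current(), which is evaluated at the user's call site and then either forwarded or captured in a detail::tls_code_loc_t. As a minimal standalone sketch of why that works, here is an example using C++20 std::source_location as a stand-in for detail::code_location; the submit_traced function is purely illustrative and not part of the SYCL API:

#include <iostream>
#include <source_location>

// A defaulted source-location argument is evaluated where the call is
// written, so the callee sees the caller's file and line without any
// macro appearing at the call site.
void submit_traced(const char *what,
                   std::source_location loc = std::source_location::current()) {
  std::cout << what << " submitted from " << loc.file_name() << ":"
            << loc.line() << "\n";
}

int main() {
  submit_traced("fill"); // reports this line, not the line where submit_traced is defined
  submit_traced("copy");
  return 0;
}

Because the default argument is evaluated at the point of call, user code keeps getting accurate code-location diagnostics without passing anything explicitly, and the headers no longer need the macro indirection.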
+__SYCL_EXPORT void +free(void *ptr, const queue &q, + const detail::code_location &CodeLoc = detail::code_location::current()); /// // Restricted USM /// -__SYCL_EXPORT void *malloc_host(size_t size, - const context &ctxt _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -malloc_host(size_t size, const context &ctxt, - const property_list &propList _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void *malloc_host(size_t size, - const queue &q _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -malloc_host(size_t size, const queue &q, - const property_list &propList _CODELOCPARAM(&CodeLoc)); +__SYCL_EXPORT void *malloc_host( + size_t size, const context &ctxt, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *malloc_host( + size_t size, const context &ctxt, const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *malloc_host( + size_t size, const queue &q, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *malloc_host( + size_t size, const queue &q, const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); -__SYCL_EXPORT void *malloc_shared(size_t size, const device &dev, - const context &ctxt _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -malloc_shared(size_t size, const device &dev, const context &ctxt, - const property_list &propList _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void *malloc_shared(size_t size, - const queue &q _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -malloc_shared(size_t size, const queue &q, - const property_list &propList _CODELOCPARAM(&CodeLoc)); +__SYCL_EXPORT void *malloc_shared( + size_t size, const device &dev, const context &ctxt, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *malloc_shared( + size_t size, const device &dev, const context &ctxt, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *malloc_shared( + size_t size, const queue &q, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *malloc_shared( + size_t size, const queue &q, const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); -__SYCL_EXPORT void * -aligned_alloc_host(size_t alignment, size_t size, - const context &ctxt _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -aligned_alloc_host(size_t alignment, size_t size, const context &ctxt, - const property_list &propList _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void *aligned_alloc_host(size_t alignment, size_t size, - const queue &q _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -aligned_alloc_host(size_t alignment, size_t size, const queue &q, - const property_list &propList _CODELOCPARAM(&CodeLoc)); +__SYCL_EXPORT void *aligned_alloc_host( + size_t alignment, size_t size, const context &ctxt, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc_host( + size_t alignment, size_t size, const context &ctxt, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc_host( + size_t alignment, size_t size, const queue &q, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc_host( + size_t alignment, size_t size, const queue &q, + 
const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); -__SYCL_EXPORT void * -aligned_alloc_shared(size_t alignment, size_t size, const device &dev, - const context &ctxt _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -aligned_alloc_shared(size_t alignment, size_t size, const device &dev, - const context &ctxt, - const property_list &propList _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -aligned_alloc_shared(size_t alignment, size_t size, - const queue &q _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void * -aligned_alloc_shared(size_t alignment, size_t size, const queue &q, - const property_list &propList _CODELOCPARAM(&CodeLoc)); +__SYCL_EXPORT void *aligned_alloc_shared( + size_t alignment, size_t size, const device &dev, const context &ctxt, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc_shared( + size_t alignment, size_t size, const device &dev, const context &ctxt, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc_shared( + size_t alignment, size_t size, const queue &q, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc_shared( + size_t alignment, size_t size, const queue &q, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); /// // single form /// -__SYCL_EXPORT void *malloc(size_t size, const device &dev, const context &ctxt, - usm::alloc kind _CODELOCPARAM(&CodeLoc)); __SYCL_EXPORT void * malloc(size_t size, const device &dev, const context &ctxt, usm::alloc kind, - const property_list &propList _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void *malloc(size_t size, const queue &q, - usm::alloc kind _CODELOCPARAM(&CodeLoc)); + const detail::code_location &CodeLoc = detail::code_location::current()); __SYCL_EXPORT void * -malloc(size_t size, const queue &q, usm::alloc kind, - const property_list &propList _CODELOCPARAM(&CodeLoc)); - -__SYCL_EXPORT void *aligned_alloc(size_t alignment, size_t size, - const device &dev, const context &ctxt, - usm::alloc kind _CODELOCPARAM(&CodeLoc)); +malloc(size_t size, const device &dev, const context &ctxt, usm::alloc kind, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); __SYCL_EXPORT void * -aligned_alloc(size_t alignment, size_t size, const device &dev, - const context &ctxt, usm::alloc kind, - const property_list &propList _CODELOCPARAM(&CodeLoc)); -__SYCL_EXPORT void *aligned_alloc(size_t alignment, size_t size, const queue &q, - usm::alloc kind _CODELOCPARAM(&CodeLoc)); +malloc(size_t size, const queue &q, usm::alloc kind, + const detail::code_location &CodeLoc = detail::code_location::current()); __SYCL_EXPORT void * -aligned_alloc(size_t alignment, size_t size, const queue &q, usm::alloc kind, - const property_list &propList _CODELOCPARAM(&CodeLoc)); +malloc(size_t size, const queue &q, usm::alloc kind, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); + +__SYCL_EXPORT void *aligned_alloc( + size_t alignment, size_t size, const device &dev, const context &ctxt, + usm::alloc kind, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc( + size_t alignment, size_t size, const device &dev, const context &ctxt, + usm::alloc kind, const property_list 
&propList, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc( + size_t alignment, size_t size, const queue &q, usm::alloc kind, + const detail::code_location &CodeLoc = detail::code_location::current()); +__SYCL_EXPORT void *aligned_alloc( + size_t alignment, size_t size, const queue &q, usm::alloc kind, + const property_list &propList, + const detail::code_location &CodeLoc = detail::code_location::current()); /// // Template forms @@ -132,17 +156,16 @@ aligned_alloc(size_t alignment, size_t size, const queue &q, usm::alloc kind, template T *malloc_device( size_t Count, const device &Dev, const context &Ctxt, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return static_cast(aligned_alloc_device(alignof(T), Count * sizeof(T), Dev, Ctxt, PropList, CodeLoc)); } template T *malloc_device( - size_t Count, const queue &Q, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + size_t Count, const queue &Q, const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return malloc_device(Count, Q.get_device(), Q.get_context(), PropList, CodeLoc); } @@ -150,8 +173,8 @@ T *malloc_device( template T *aligned_alloc_device( size_t Alignment, size_t Count, const device &Dev, const context &Ctxt, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return static_cast(aligned_alloc_device(max(Alignment, alignof(T)), Count * sizeof(T), Dev, Ctxt, PropList, CodeLoc)); @@ -160,43 +183,40 @@ T *aligned_alloc_device( template T *aligned_alloc_device( size_t Alignment, size_t Count, const queue &Q, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return aligned_alloc_device(Alignment, Count, Q.get_device(), Q.get_context(), PropList, CodeLoc); } template T *malloc_host( - size_t Count, const context &Ctxt, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + size_t Count, const context &Ctxt, const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return static_cast(aligned_alloc_host(alignof(T), Count * sizeof(T), Ctxt, PropList, CodeLoc)); } template T *malloc_host( - size_t Count, const queue &Q, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + size_t Count, const queue &Q, const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return malloc_host(Count, Q.get_context(), PropList, CodeLoc); } template T *malloc_shared( size_t Count, const device &Dev, const context &Ctxt, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return static_cast(aligned_alloc_shared(alignof(T), Count * sizeof(T), Dev, Ctxt, PropList, CodeLoc)); } template T *malloc_shared( - size_t Count, const queue &Q, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - 
_CODELOCARG(&CodeLoc); + size_t Count, const queue &Q, const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return malloc_shared(Count, Q.get_device(), Q.get_context(), PropList, CodeLoc); } @@ -204,8 +224,8 @@ T *malloc_shared( template T *aligned_alloc_host( size_t Alignment, size_t Count, const context &Ctxt, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return static_cast(aligned_alloc_host(std ::max(Alignment, alignof(T)), Count * sizeof(T), Ctxt, PropList, CodeLoc)); @@ -214,8 +234,8 @@ T *aligned_alloc_host( template T *aligned_alloc_host( size_t Alignment, size_t Count, const queue &Q, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return aligned_alloc_host(Alignment, Count, Q.get_context(), PropList, CodeLoc); } @@ -223,8 +243,8 @@ T *aligned_alloc_host( template T *aligned_alloc_shared( size_t Alignment, size_t Count, const device &Dev, const context &Ctxt, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return static_cast(aligned_alloc_shared(max(Alignment, alignof(T)), Count * sizeof(T), Dev, Ctxt, PropList, CodeLoc)); @@ -233,8 +253,8 @@ T *aligned_alloc_shared( template T *aligned_alloc_shared( size_t Alignment, size_t Count, const queue &Q, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return aligned_alloc_shared(Alignment, Count, Q.get_device(), Q.get_context(), PropList, CodeLoc); } @@ -242,8 +262,8 @@ T *aligned_alloc_shared( template T *malloc( size_t Count, const device &Dev, const context &Ctxt, usm::alloc Kind, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return static_cast(aligned_alloc(alignof(T), Count * sizeof(T), Dev, Ctxt, Kind, PropList, CodeLoc)); } @@ -251,8 +271,8 @@ T *malloc( template T *malloc( size_t Count, const queue &Q, usm::alloc Kind, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return malloc(Count, Q.get_device(), Q.get_context(), Kind, PropList, CodeLoc); } @@ -260,9 +280,8 @@ T *malloc( template T *aligned_alloc( size_t Alignment, size_t Count, const device &Dev, const context &Ctxt, - usm::alloc Kind, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + usm::alloc Kind, const property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return static_cast(aligned_alloc(max(Alignment, alignof(T)), Count * sizeof(T), Dev, Ctxt, Kind, PropList, CodeLoc)); @@ -271,8 +290,8 @@ T *aligned_alloc( template T *aligned_alloc( size_t Alignment, size_t Count, const queue &Q, usm::alloc Kind, - const property_list &PropList = {} _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const 
property_list &PropList = {}, + const detail::code_location &CodeLoc = detail::code_location::current()) { return aligned_alloc(Alignment, Count, Q.get_device(), Q.get_context(), Kind, PropList, CodeLoc); } diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp index 70acfc07a8c98..976037c08292c 100644 --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -297,187 +297,169 @@ void free(void *Ptr, const context &Ctxt, } // namespace usm } // namespace detail -void *malloc_device(size_t Size, const device &Dev, - const context &Ctxt _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc_device(size_t Size, const device &Dev, const context &Ctxt, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(0, Size, Ctxt, Dev, alloc::device, property_list{}, CodeLoc); } void *malloc_device(size_t Size, const device &Dev, const context &Ctxt, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(0, Size, Ctxt, Dev, alloc::device, PropList, CodeLoc); } -void *malloc_device(size_t Size, const queue &Q _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc_device(size_t Size, const queue &Q, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(0, Size, Q.get_context(), Q.get_device(), alloc::device, property_list{}, CodeLoc); } -void *malloc_device(size_t Size, const queue &Q, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc_device(size_t Size, const queue &Q, const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(0, Size, Q.get_context(), Q.get_device(), alloc::device, PropList, CodeLoc); } void *aligned_alloc_device(size_t Alignment, size_t Size, const device &Dev, - const context &Ctxt _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const context &Ctxt, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(Alignment, Size, Ctxt, Dev, alloc::device, property_list{}, CodeLoc); } -void * -aligned_alloc_device(size_t Alignment, size_t Size, const device &Dev, - const context &Ctxt, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_device(size_t Alignment, size_t Size, const device &Dev, + const context &Ctxt, const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(Alignment, Size, Ctxt, Dev, alloc::device, PropList, CodeLoc); } -void *aligned_alloc_device(size_t Alignment, size_t Size, - const queue &Q _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_device(size_t Alignment, size_t Size, const queue &Q, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(Alignment, Size, Q.get_context(), Q.get_device(), alloc::device, property_list{}, CodeLoc); } -void * -aligned_alloc_device(size_t Alignment, size_t Size, const queue &Q, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_device(size_t Alignment, size_t Size, const queue &Q, + const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(Alignment, Size, Q.get_context(), Q.get_device(), alloc::device, PropList, CodeLoc); } -void free(void *ptr, const context &Ctxt _CODELOCPARAMDEF(&CodeLoc)) { - 
_CODELOCARG(&CodeLoc); +void free(void *ptr, const context &Ctxt, + const detail::code_location &CodeLoc) { return detail::usm::free(ptr, Ctxt, CodeLoc); } -void free(void *ptr, const queue &Q _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void free(void *ptr, const queue &Q, const detail::code_location &CodeLoc) { return detail::usm::free(ptr, Q.get_context(), CodeLoc); } -void *malloc_host(size_t Size, const context &Ctxt _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc_host(size_t Size, const context &Ctxt, + const detail::code_location &CodeLoc) { return detail::usm::alignedAllocHost(0, Size, Ctxt, alloc::host, property_list{}, CodeLoc); } void *malloc_host(size_t Size, const context &Ctxt, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAllocHost(0, Size, Ctxt, alloc::host, PropList, CodeLoc); } -void *malloc_host(size_t Size, const queue &Q _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc_host(size_t Size, const queue &Q, + const detail::code_location &CodeLoc) { return detail::usm::alignedAllocHost(0, Size, Q.get_context(), alloc::host, property_list{}, CodeLoc); } -void *malloc_host(size_t Size, const queue &Q, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc_host(size_t Size, const queue &Q, const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAllocHost(0, Size, Q.get_context(), alloc::host, PropList, CodeLoc); } -void *malloc_shared(size_t Size, const device &Dev, - const context &Ctxt _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc_shared(size_t Size, const device &Dev, const context &Ctxt, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(0, Size, Ctxt, Dev, alloc::shared, property_list{}, CodeLoc); } void *malloc_shared(size_t Size, const device &Dev, const context &Ctxt, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(0, Size, Ctxt, Dev, alloc::shared, PropList, CodeLoc); } -void *malloc_shared(size_t Size, const queue &Q _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc_shared(size_t Size, const queue &Q, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(0, Size, Q.get_context(), Q.get_device(), alloc::shared, property_list{}, CodeLoc); } -void *malloc_shared(size_t Size, const queue &Q, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc_shared(size_t Size, const queue &Q, const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(0, Size, Q.get_context(), Q.get_device(), alloc::shared, PropList, CodeLoc); } -void *aligned_alloc_host(size_t Alignment, size_t Size, - const context &Ctxt _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_host(size_t Alignment, size_t Size, const context &Ctxt, + const detail::code_location &CodeLoc) { return detail::usm::alignedAllocHost(Alignment, Size, Ctxt, alloc::host, property_list{}, CodeLoc); } -void * -aligned_alloc_host(size_t Alignment, size_t Size, const context &Ctxt, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_host(size_t Alignment, 
size_t Size, const context &Ctxt, + const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAllocHost(Alignment, Size, Ctxt, alloc::host, PropList, CodeLoc); } -void *aligned_alloc_host(size_t Alignment, size_t Size, - const queue &Q _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_host(size_t Alignment, size_t Size, const queue &Q, + const detail::code_location &CodeLoc) { return detail::usm::alignedAllocHost(Alignment, Size, Q.get_context(), alloc::host, property_list{}, CodeLoc); } -void * -aligned_alloc_host(size_t Alignment, size_t Size, const queue &Q, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_host(size_t Alignment, size_t Size, const queue &Q, + const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAllocHost(Alignment, Size, Q.get_context(), alloc::host, PropList, CodeLoc); } void *aligned_alloc_shared(size_t Alignment, size_t Size, const device &Dev, - const context &Ctxt _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const context &Ctxt, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(Alignment, Size, Ctxt, Dev, alloc::shared, property_list{}, CodeLoc); } -void * -aligned_alloc_shared(size_t Alignment, size_t Size, const device &Dev, - const context &Ctxt, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_shared(size_t Alignment, size_t Size, const device &Dev, + const context &Ctxt, const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(Alignment, Size, Ctxt, Dev, alloc::shared, PropList, CodeLoc); } -void *aligned_alloc_shared(size_t Alignment, size_t Size, - const queue &Q _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_shared(size_t Alignment, size_t Size, const queue &Q, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(Alignment, Size, Q.get_context(), Q.get_device(), alloc::shared, property_list{}, CodeLoc); } -void * -aligned_alloc_shared(size_t Alignment, size_t Size, const queue &Q, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc_shared(size_t Alignment, size_t Size, const queue &Q, + const property_list &PropList, + const detail::code_location &CodeLoc) { return detail::usm::alignedAlloc(Alignment, Size, Q.get_context(), Q.get_device(), alloc::shared, PropList, CodeLoc); @@ -486,17 +468,16 @@ aligned_alloc_shared(size_t Alignment, size_t Size, const queue &Q, // single form void *malloc(size_t Size, const device &Dev, const context &Ctxt, alloc Kind, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList, + const detail::code_location &CodeLoc) { if (Kind == alloc::host) return detail::usm::alignedAllocHost(0, Size, Ctxt, Kind, PropList, CodeLoc); return detail::usm::alignedAlloc(0, Size, Ctxt, Dev, Kind, PropList, CodeLoc); } -void *malloc(size_t Size, const device &Dev, const context &Ctxt, - alloc Kind _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc(size_t Size, const device &Dev, const context &Ctxt, alloc Kind, + const detail::code_location &CodeLoc) { if (Kind == alloc::host) return detail::usm::alignedAllocHost(0, Size, Ctxt, Kind, property_list{}, CodeLoc); @@ -504,9 +485,8 @@ void *malloc(size_t Size, const device &Dev, const context &Ctxt, 
CodeLoc); } -void *malloc(size_t Size, const queue &Q, - alloc Kind _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *malloc(size_t Size, const queue &Q, alloc Kind, + const detail::code_location &CodeLoc) { if (Kind == alloc::host) return detail::usm::alignedAllocHost(0, Size, Q.get_context(), Kind, property_list{}, CodeLoc); @@ -515,8 +495,8 @@ void *malloc(size_t Size, const queue &Q, } void *malloc(size_t Size, const queue &Q, alloc Kind, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList, + const detail::code_location &CodeLoc) { if (Kind == alloc::host) return detail::usm::alignedAllocHost(0, Size, Q.get_context(), Kind, PropList, CodeLoc); @@ -525,9 +505,8 @@ void *malloc(size_t Size, const queue &Q, alloc Kind, } void *aligned_alloc(size_t Alignment, size_t Size, const device &Dev, - const context &Ctxt, - alloc Kind _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const context &Ctxt, alloc Kind, + const detail::code_location &CodeLoc) { if (Kind == alloc::host) return detail::usm::alignedAllocHost(Alignment, Size, Ctxt, Kind, property_list{}, CodeLoc); @@ -538,8 +517,8 @@ void *aligned_alloc(size_t Alignment, size_t Size, const device &Dev, void *aligned_alloc(size_t Alignment, size_t Size, const device &Dev, const context &Ctxt, alloc Kind, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList, + const detail::code_location &CodeLoc) { if (Kind == alloc::host) return detail::usm::alignedAllocHost(Alignment, Size, Ctxt, Kind, PropList, CodeLoc); @@ -547,9 +526,8 @@ void *aligned_alloc(size_t Alignment, size_t Size, const device &Dev, CodeLoc); } -void *aligned_alloc(size_t Alignment, size_t Size, const queue &Q, - alloc Kind _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +void *aligned_alloc(size_t Alignment, size_t Size, const queue &Q, alloc Kind, + const detail::code_location &CodeLoc) { if (Kind == alloc::host) return detail::usm::alignedAllocHost(Alignment, Size, Q.get_context(), Kind, property_list{}, CodeLoc); @@ -559,8 +537,8 @@ void *aligned_alloc(size_t Alignment, size_t Size, const queue &Q, } void *aligned_alloc(size_t Alignment, size_t Size, const queue &Q, alloc Kind, - const property_list &PropList _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const property_list &PropList, + const detail::code_location &CodeLoc) { if (Kind == alloc::host) return detail::usm::alignedAllocHost(Alignment, Size, Q.get_context(), Kind, PropList, CodeLoc); diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 90bc1bf3d3a06..ed2c2c402b006 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -84,75 +84,65 @@ bool queue::is_host() const { void queue::throw_asynchronous() { impl->throw_asynchronous(); } -event queue::memset(void *Ptr, int Value, - size_t Count _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +event queue::memset(void *Ptr, int Value, size_t Count, + const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return impl->memset(impl, Ptr, Value, Count, {}); } -event queue::memset(void *Ptr, int Value, size_t Count, - event DepEvent _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +event queue::memset(void *Ptr, int Value, size_t Count, event DepEvent, + const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return impl->memset(impl, Ptr, Value, Count, {DepEvent}); } event queue::memset(void *Ptr, int 
Value, size_t Count, - const std::vector &DepEvents - _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const std::vector &DepEvents, + const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return impl->memset(impl, Ptr, Value, Count, DepEvents); } -event queue::memcpy(void *Dest, const void *Src, - size_t Count _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +event queue::memcpy(void *Dest, const void *Src, size_t Count, + const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return impl->memcpy(impl, Dest, Src, Count, {}); } -event queue::memcpy(void *Dest, const void *Src, size_t Count, - event DepEvent _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +event queue::memcpy(void *Dest, const void *Src, size_t Count, event DepEvent, + const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return impl->memcpy(impl, Dest, Src, Count, {DepEvent}); } event queue::memcpy(void *Dest, const void *Src, size_t Count, - const std::vector &DepEvents - _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const std::vector &DepEvents, + const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return impl->memcpy(impl, Dest, Src, Count, DepEvents); } -event queue::mem_advise(const void *Ptr, size_t Length, - pi_mem_advice Advice _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +event queue::mem_advise(const void *Ptr, size_t Length, pi_mem_advice Advice, + const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return mem_advise(Ptr, Length, int(Advice)); } -event queue::mem_advise(const void *Ptr, size_t Length, - int Advice _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); +event queue::mem_advise(const void *Ptr, size_t Length, int Advice, + const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return impl->mem_advise(impl, Ptr, Length, pi_mem_advice(Advice), {}); } event queue::mem_advise(const void *Ptr, size_t Length, int Advice, - event DepEvent _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + event DepEvent, const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return impl->mem_advise(impl, Ptr, Length, pi_mem_advice(Advice), {DepEvent}); } event queue::mem_advise(const void *Ptr, size_t Length, int Advice, - const std::vector &DepEvents - _CODELOCPARAMDEF(&CodeLoc)) { - _CODELOCARG(&CodeLoc); + const std::vector &DepEvents, + const detail::code_location &CodeLoc) { detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); return impl->mem_advise(impl, Ptr, Length, pi_mem_advice(Advice), DepEvents); } From c87e7802ef8087e81d66c41819fa05ab76fbd408 Mon Sep 17 00:00:00 2001 From: Byoungro So Date: Tue, 13 Jun 2023 23:09:42 -0700 Subject: [PATCH 15/55] [SYCL] Disable the ZE_DEBUG tests on Windows (#9854) We should not run ZE_DEBUG tests on Windows. 
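The tests below are skipped through lit's UNSUPPORTED directive: the boolean expression ze_debug && windows is evaluated against the features of the current test configuration, and the test is skipped when it is true. A rough sketch of the resulting test header follows; the RUN and UNSUPPORTED lines mirror the diffs below, while the test body is illustrative only:

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Windows doesn't yet have full shutdown().
// UNSUPPORTED: ze_debug && windows

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  q.single_task([] {}).wait();
  return 0;
}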
--------- Signed-off-by: Byoungro So --- sycl/test-e2e/Reduction/reduction_complex_nums.cpp | 3 +++ sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp | 3 +++ sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp | 3 +++ sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp | 3 +++ sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp | 3 +++ sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp | 3 +++ 6 files changed, 18 insertions(+) diff --git a/sycl/test-e2e/Reduction/reduction_complex_nums.cpp b/sycl/test-e2e/Reduction/reduction_complex_nums.cpp index 84cd7f545e989..ac069d7d1e414 100644 --- a/sycl/test-e2e/Reduction/reduction_complex_nums.cpp +++ b/sycl/test-e2e/Reduction/reduction_complex_nums.cpp @@ -1,6 +1,9 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out +// Windows doesn't yet have full shutdown(). +// UNSUPPORTED: ze_debug && windows + #include #include #include diff --git a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp index b8d625d47bc1d..5c2a9edc4682c 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_N_queue_shortcut.cpp @@ -7,6 +7,9 @@ // This test only checks that the method queue::parallel_for() accepting // reduction, can be properly translated into queue::submit + parallel_for(). +// Windows doesn't yet have full shutdown(). +// UNSUPPORTED: ze_debug && windows + #include "reduction_utils.hpp" using namespace sycl; diff --git a/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp index fab80b0848420..ab2a583ce2f4c 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_reducer_skip.cpp @@ -4,6 +4,9 @@ // Group algorithms are not supported on Nvidia. // XFAIL: hip_nvidia +// Windows doesn't yet have full shutdown(). +// UNSUPPORTED: ze_debug && windows + // This test performs basic checks of parallel_for(nd_range, reduction, func) // with reductions initialized with a one element buffer. Additionally, some // reducers will not be written to. diff --git a/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp index 002d84745bf13..bb8c56d1a8c2d 100644 --- a/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_1d_reducer_skip.cpp @@ -1,6 +1,9 @@ // RUN: %{build} -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %} // RUN: %{run} %t.out +// Windows doesn't yet have full shutdown(). +// UNSUPPORTED: ze_debug && windows + // This test performs basic checks of parallel_for(range<1>, reduction, func) // with reductions initialized with a one element buffer. Additionally, some // reducers will not be written to. diff --git a/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp index 06522222a1577..4a2c7fd24ac00 100644 --- a/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_2d_dw_reducer_skip.cpp @@ -1,6 +1,9 @@ // RUN: %{build} -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %} // RUN: %{run} %t.out +// Windows doesn't yet have full shutdown(). 
+// UNSUPPORTED: ze_debug && windows + // This test performs basic checks of parallel_for(range<2>, reduction, func) // with reductions initialized with a one element buffer. Additionally, some // reducers will not be written to. diff --git a/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp b/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp index 2252498f87644..85960235d2e3c 100644 --- a/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp +++ b/sycl/test-e2e/Reduction/reduction_range_3d_rw_reducer_skip.cpp @@ -1,6 +1,9 @@ // RUN: %{build} -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_60 %} // RUN: %{run} %t.out +// Windows doesn't yet have full shutdown(). +// UNSUPPORTED: ze_debug && windows + // This test performs basic checks of parallel_for(range<3>, reduction, func) // with reductions initialized with a one element buffer. Additionally, some // reducers will not be written to. From ec59d449a75ec60e17734884ec3bf587862181f5 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 14 Jun 2023 11:42:58 +0100 Subject: [PATCH 16/55] [SYCL][CUDA] Port CUDA plugin to Unified Runtime (#9512) This moves the CUDA plugin implementation to Unified Runtime; and changes the pi_cuda plugin to use pi2ur to implement PI. The changes to the implementation have been kept to a minimum and should be functionally the same. Documentation and comments have been moved verbatim, other than changing PI references to UR. This PR is based on top of the Level Zero adapter (#8744) so will only be ready when that is merged. --------- Co-authored-by: Petr Vesely Co-authored-by: Omar Ahmed Co-authored-by: Martin Morrison-Grant Co-authored-by: Aaron Greig --- sycl/plugins/cuda/CMakeLists.txt | 35 +- sycl/plugins/cuda/pi_cuda.cpp | 6109 +---------------- sycl/plugins/cuda/pi_cuda.hpp | 988 +-- sycl/plugins/unified_runtime/CMakeLists.txt | 45 + sycl/plugins/unified_runtime/pi2ur.hpp | 23 +- .../ur/adapters/cuda/common.cpp | 106 + .../ur/adapters/cuda/common.hpp | 59 + .../ur/adapters/cuda/context.cpp | 154 + .../ur/adapters/cuda/context.hpp | 139 + .../ur/adapters/cuda/device.cpp | 1201 ++++ .../ur/adapters/cuda/device.hpp | 59 + .../ur/adapters/cuda/enqueue.cpp | 1739 +++++ .../ur/adapters/cuda/event.cpp | 306 + .../ur/adapters/cuda/event.hpp | 189 + .../ur/adapters/cuda/kernel.cpp | 380 + .../ur/adapters/cuda/kernel.hpp | 200 + .../ur/adapters/cuda/memory.cpp | 507 ++ .../ur/adapters/cuda/memory.hpp | 185 + .../ur/adapters/cuda/platform.cpp | 203 + .../ur/adapters/cuda/platform.hpp | 15 + .../ur/adapters/cuda/program.cpp | 476 ++ .../ur/adapters/cuda/program.hpp | 54 + .../ur/adapters/cuda/queue.cpp | 331 + .../ur/adapters/cuda/queue.hpp | 244 + .../ur/adapters/cuda/sampler.cpp | 86 + .../ur/adapters/cuda/sampler.hpp | 29 + .../ur/adapters}/cuda/tracing.cpp | 0 .../ur/adapters/cuda/ur_interface_loader.cpp | 264 + .../unified_runtime/ur/adapters/cuda/usm.cpp | 256 + sycl/plugins/unified_runtime/ur/ur.hpp | 20 +- .../sync_two_queues_event_dep.cpp | 1 + sycl/unittests/pi/cuda/CMakeLists.txt | 2 + 32 files changed, 7445 insertions(+), 6960 deletions(-) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp create mode 
100644 sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp rename sycl/plugins/{ => unified_runtime/ur/adapters}/cuda/tracing.cpp (100%) create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp create mode 100644 sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index d3e742267af34..2570b6f7e7348 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -48,18 +48,51 @@ endif() add_sycl_plugin(cuda SOURCES + # Some code is shared with the UR adapter + "../unified_runtime/pi2ur.hpp" + "../unified_runtime/pi2ur.cpp" + "../unified_runtime/ur/ur.hpp" + "../unified_runtime/ur/ur.cpp" + "../unified_runtime/ur/usm_allocator.cpp" + "../unified_runtime/ur/usm_allocator.hpp" + "../unified_runtime/ur/adapters/cuda/common.cpp" + "../unified_runtime/ur/adapters/cuda/common.hpp" + "../unified_runtime/ur/adapters/cuda/context.cpp" + "../unified_runtime/ur/adapters/cuda/context.hpp" + "../unified_runtime/ur/adapters/cuda/device.cpp" + "../unified_runtime/ur/adapters/cuda/device.hpp" + "../unified_runtime/ur/adapters/cuda/enqueue.cpp" + "../unified_runtime/ur/adapters/cuda/event.cpp" + "../unified_runtime/ur/adapters/cuda/event.hpp" + "../unified_runtime/ur/adapters/cuda/kernel.cpp" + "../unified_runtime/ur/adapters/cuda/kernel.hpp" + "../unified_runtime/ur/adapters/cuda/memory.cpp" + "../unified_runtime/ur/adapters/cuda/memory.hpp" + "../unified_runtime/ur/adapters/cuda/platform.cpp" + "../unified_runtime/ur/adapters/cuda/platform.hpp" + "../unified_runtime/ur/adapters/cuda/program.cpp" + "../unified_runtime/ur/adapters/cuda/program.hpp" + "../unified_runtime/ur/adapters/cuda/queue.cpp" + "../unified_runtime/ur/adapters/cuda/queue.hpp" + "../unified_runtime/ur/adapters/cuda/sampler.cpp" + "../unified_runtime/ur/adapters/cuda/sampler.hpp" + "../unified_runtime/ur/adapters/cuda/tracing.cpp" + "../unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp" + "../unified_runtime/ur/adapters/cuda/usm.cpp" + # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" "pi_cuda.hpp" "pi_cuda.cpp" - "tracing.cpp" ${XPTI_PROXY_SRC} INCLUDE_DIRS ${sycl_inc_dir} ${XPTI_INCLUDE} + ${CMAKE_CURRENT_SOURCE_DIR}/../unified_runtime LIBRARIES cudadrv ${XPTI_LIBS} + UnifiedRuntime-Headers HEADER 
"${CMAKE_CURRENT_SOURCE_DIR}/include/features.hpp" ) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index dd68c196e94c1..9af47b47a6b2a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -16,5880 +16,11 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Forward declarations -void enableCUDATracing(); -void disableCUDATracing(); - -namespace { -std::string getCudaVersionString() { - int driver_version = 0; - cuDriverGetVersion(&driver_version); - // The version is returned as (1000 major + 10 minor). - std::stringstream stream; - stream << "CUDA " << driver_version / 1000 << "." - << driver_version % 1000 / 10; - return stream.str(); -} - -pi_result map_error(CUresult result) { - switch (result) { - case CUDA_SUCCESS: - return PI_SUCCESS; - case CUDA_ERROR_NOT_PERMITTED: - return PI_ERROR_INVALID_OPERATION; - case CUDA_ERROR_INVALID_CONTEXT: - return PI_ERROR_INVALID_CONTEXT; - case CUDA_ERROR_INVALID_DEVICE: - return PI_ERROR_INVALID_DEVICE; - case CUDA_ERROR_INVALID_VALUE: - return PI_ERROR_INVALID_VALUE; - case CUDA_ERROR_OUT_OF_MEMORY: - return PI_ERROR_OUT_OF_HOST_MEMORY; - case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: - return PI_ERROR_OUT_OF_RESOURCES; - default: - return PI_ERROR_UNKNOWN; - } -} - -// Global variables for PI_ERROR_PLUGIN_SPECIFIC_ERROR -constexpr size_t MaxMessageSize = 256; -thread_local pi_result ErrorMessageCode = PI_SUCCESS; -thread_local char ErrorMessage[MaxMessageSize]; - -// Utility function for setting a message and warning -static void setErrorMessage(const char *message, pi_result error_code) { - assert(strlen(message) <= MaxMessageSize); - strcpy(ErrorMessage, message); - ErrorMessageCode = error_code; -} - -// Returns plugin specific error and warning messages -pi_result cuda_piPluginGetLastError(char **message) { - *message = &ErrorMessage[0]; - return ErrorMessageCode; -} - -// Returns plugin specific backend option. -// Current support is only for optimization options. -// Return empty string for cuda. -// TODO: Determine correct string to be passed. -pi_result cuda_piPluginGetBackendOption(pi_platform, - const char *frontend_option, - const char **backend_option) { - using namespace std::literals; - if (frontend_option == nullptr) - return PI_ERROR_INVALID_VALUE; - if (frontend_option == "-O0"sv || frontend_option == "-O1"sv || - frontend_option == "-O2"sv || frontend_option == "-O3"sv || - frontend_option == ""sv) { - *backend_option = ""; - return PI_SUCCESS; - } - return PI_ERROR_INVALID_VALUE; -} - -// Iterates over the event wait list, returns correct pi_result error codes. -// Invokes the callback for the latest event of each queue in the wait list. -// The callback must take a single pi_event argument and return a pi_result. -template -pi_result forLatestEvents(const pi_event *event_wait_list, - std::size_t num_events_in_wait_list, Func &&f) { - - if (event_wait_list == nullptr || num_events_in_wait_list == 0) { - return PI_ERROR_INVALID_EVENT_WAIT_LIST; - } - - // Fast path if we only have a single event - if (num_events_in_wait_list == 1) { - return f(event_wait_list[0]); - } - - std::vector events{event_wait_list, - event_wait_list + num_events_in_wait_list}; - std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) { - // Tiered sort creating sublists of streams (smallest value first) in which - // the corresponding events are sorted into a sequence of newest first. 
- return e0->get_stream() < e1->get_stream() || - (e0->get_stream() == e1->get_stream() && - e0->get_event_id() > e1->get_event_id()); - }); - - bool first = true; - CUstream lastSeenStream = 0; - for (pi_event event : events) { - if (!event || (!first && event->get_stream() == lastSeenStream)) { - continue; - } - - first = false; - lastSeenStream = event->get_stream(); - - auto result = f(event); - if (result != PI_SUCCESS) { - return result; - } - } - - return PI_SUCCESS; -} - -/// Converts CUDA error into PI error codes, and outputs error information -/// to stderr. -/// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of -/// throwing the error. This is intended for debugging purposes. -/// \return PI_SUCCESS if \param result was CUDA_SUCCESS. -/// \throw pi_error exception (integer) if input was not success. -/// -pi_result check_error(CUresult result, const char *function, int line, - const char *file) { - if (result == CUDA_SUCCESS || result == CUDA_ERROR_DEINITIALIZED) { - return PI_SUCCESS; - } - - if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { - const char *errorString = nullptr; - const char *errorName = nullptr; - cuGetErrorName(result, &errorName); - cuGetErrorString(result, &errorString); - std::stringstream ss; - ss << "\nPI CUDA ERROR:" - << "\n\tValue: " << result - << "\n\tName: " << errorName - << "\n\tDescription: " << errorString - << "\n\tFunction: " << function << "\n\tSource Location: " << file - << ":" << line << "\n" - << std::endl; - std::cerr << ss.str(); - } - - if (std::getenv("PI_CUDA_ABORT") != nullptr) { - std::abort(); - } - - throw map_error(result); -} - -/// \cond NODOXY -#define PI_CHECK_ERROR(result) check_error(result, __func__, __LINE__, __FILE__) - -/// ScopedContext is used across all PI CUDA plugin implementation to ensure -/// that the proper CUDA context is active for the given PI context. -// -/// This class will only replace the context if necessary, and will leave the -/// new context active on the current thread. If there was an active context -/// already it will simply be replaced. -// -/// Previously active contexts are not restored for two reasons: -/// * Performance: context switches are expensive so leaving the context active -/// means subsequent SYCL calls with the same context will be cheaper. -/// * Multi-threading cleanup: contexts are set active per thread and deleting a -/// context will only deactivate it for the current thread. This means other -/// threads may end up with deleted active contexts. In particular this can -/// happen with host_tasks as they run in a thread pool. When the context -/// associated with these tasks is deleted it will remain active in the -/// threads of the thread pool. So it would be invalid for any other task -/// running on these threads to try to restore the deleted context. With the -/// current implementation this is not an issue because the active deleted -/// context will just be replaced. -// -/// This approach does mean that CUDA interop tasks should NOT expect their -/// contexts to be restored by SYCL. 
-class ScopedContext { -public: - ScopedContext(pi_context ctxt) { - if (!ctxt) { - throw PI_ERROR_INVALID_CONTEXT; - } - - set_context(ctxt->get()); - } - - ScopedContext(CUcontext ctxt) { set_context(ctxt); } - - ~ScopedContext() {} - -private: - void set_context(CUcontext desired) { - CUcontext original = nullptr; - - PI_CHECK_ERROR(cuCtxGetCurrent(&original)); - - // Make sure the desired context is active on the current thread, setting - // it if necessary - if (original != desired) { - PI_CHECK_ERROR(cuCtxSetCurrent(desired)); - } - } -}; - -/// \cond NODOXY -template -pi_result getInfoImpl(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value, size_t value_size, - Assign &&assign_func) { - - if (param_value != nullptr) { - - if (param_value_size < value_size) { - return PI_ERROR_INVALID_VALUE; - } - - assign_func(param_value, value, value_size); - } - - if (param_value_size_ret != nullptr) { - *param_value_size_ret = value_size; - } - - return PI_SUCCESS; -} - -template -pi_result getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value) { - - auto assignment = [](void *param_value, T value, size_t value_size) { - // Ignore unused parameter - (void)value_size; - - *static_cast(param_value) = value; - }; - - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - sizeof(T), assignment); -} - -template -pi_result getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - T *value) { - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - array_length * sizeof(T), memcpy); -} - -template <> -pi_result getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, - const char *value) { - return getInfoArray(strlen(value) + 1, param_value_size, param_value, - param_value_size_ret, value); -} - -int getAttribute(pi_device device, CUdevice_attribute attribute) { - int value; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); - return value; -} -/// \endcond - -// Determine local work sizes that result in uniform work groups. -// The default threadsPerBlock only require handling the first work_dim -// dimension. 
-void guessLocalWorkSize(_pi_device *device, size_t *threadsPerBlock, - const size_t *global_work_size, - const size_t maxThreadsPerBlock[3], pi_kernel kernel, - pi_uint32 local_size) { - assert(threadsPerBlock != nullptr); - assert(global_work_size != nullptr); - assert(kernel != nullptr); - int minGrid, maxBlockSize, maxBlockDim[3]; - - static auto isPrime = [](size_t number) -> bool { - auto lastNumToCheck = ceil(sqrt(number)); - if (number < 2) - return false; - if (number == 2) - return true; - if (number % 2 == 0) - return false; - for (int i = 3; i <= lastNumToCheck; i += 2) { - if (number % i == 0) - return false; - } - return true; - }; - - cuDeviceGetAttribute(&maxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()); - cuDeviceGetAttribute(&maxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()); - - PI_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize( - &minGrid, &maxBlockSize, kernel->get(), NULL, local_size, - maxThreadsPerBlock[0])); - - threadsPerBlock[2] = std::min(global_work_size[2], size_t(maxBlockDim[2])); - threadsPerBlock[1] = - std::min(global_work_size[1], std::min(maxBlockSize / threadsPerBlock[2], - size_t(maxBlockDim[1]))); - maxBlockDim[0] = maxBlockSize / (threadsPerBlock[1] * threadsPerBlock[2]); - threadsPerBlock[0] = - std::min(maxThreadsPerBlock[0], - std::min(global_work_size[0], size_t(maxBlockDim[0]))); - - // When global_work_size[0] is prime threadPerBlock[0] will later computed as - // 1, which is not efficient configuration. In such case we use - // global_work_size[0] + 1 to compute threadPerBlock[0]. - int adjusted_0_dim_global_work_size = - (isPrime(global_work_size[0]) && - (threadsPerBlock[0] != global_work_size[0])) - ? global_work_size[0] + 1 - : global_work_size[0]; - - static auto isPowerOf2 = [](size_t value) -> bool { - return value && !(value & (value - 1)); - }; - - // Find a local work group size that is a divisor of the global - // work group size to produce uniform work groups. - // Additionally, for best compute utilisation, the local size has - // to be a power of two. - while (0u != (adjusted_0_dim_global_work_size % threadsPerBlock[0]) || - !isPowerOf2(threadsPerBlock[0])) { - --threadsPerBlock[0]; - } -} - -pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list) { - if (!event_wait_list) { - return PI_SUCCESS; - } - try { - ScopedContext active(command_queue->get_context()); - - auto result = forLatestEvents( - event_wait_list, num_events_in_wait_list, - [stream](pi_event event) -> pi_result { - if (event->get_stream() == stream) { - return PI_SUCCESS; - } else { - return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); - } - }); - - if (result != PI_SUCCESS) { - return result; - } - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } -} - -template -void getUSMHostOrDevicePtr(PtrT usm_ptr, CUmemorytype *out_mem_type, - CUdeviceptr *out_dev_ptr, PtrT *out_host_ptr) { - // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - // checks with PI_CHECK_ERROR are not suggested - CUresult ret = cuPointerGetAttribute( - out_mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)usm_ptr); - assert((*out_mem_type != CU_MEMORYTYPE_ARRAY && - *out_mem_type != CU_MEMORYTYPE_UNIFIED) && - "ARRAY, UNIFIED types are not supported!"); - - // pointer not known to the CUDA subsystem (possibly a system allocated ptr) - if (ret == CUDA_ERROR_INVALID_VALUE) { - *out_mem_type = CU_MEMORYTYPE_HOST; - *out_dev_ptr = 0; - *out_host_ptr = usm_ptr; - - // todo: resets the above "non-stick" error - } else if (ret == CUDA_SUCCESS) { - *out_dev_ptr = (*out_mem_type == CU_MEMORYTYPE_DEVICE) - ? reinterpret_cast(usm_ptr) - : 0; - *out_host_ptr = (*out_mem_type == CU_MEMORYTYPE_HOST) ? usm_ptr : nullptr; - } else { - PI_CHECK_ERROR(ret); - } -} - -bool getMaxRegistersJitOptionValue(const std::string &build_options, - unsigned int &value) { - using namespace std::string_view_literals; - const std::size_t optionPos = build_options.find_first_of("maxrregcount"sv); - if (optionPos == std::string::npos) { - return false; - } - - const std::size_t delimPos = build_options.find('=', optionPos + 1u); - if (delimPos == std::string::npos) { - return false; - } - - const std::size_t length = build_options.length(); - const std::size_t startPos = delimPos + 1u; - if (delimPos == std::string::npos || startPos >= length) { - return false; - } - - std::size_t pos = startPos; - while (pos < length && - std::isdigit(static_cast(build_options[pos]))) { - pos++; - } - - const std::string valueString = - build_options.substr(startPos, pos - startPos); - if (valueString.empty()) { - return false; - } - - value = static_cast(std::stoi(valueString)); - return true; -} - -// Helper to verify out-of-registers case (exceeded block max registers). -// If the kernel requires a number of registers for the entire thread -// block exceeds the hardware limitations, then the cuLaunchKernel call -// will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. -bool hasExceededMaxRegistersPerBlock(pi_device device, pi_kernel kernel, - size_t blockSize) { - assert(device); - assert(kernel); - - int maxRegsPerBlock{0}; - PI_CHECK_ERROR(cuDeviceGetAttribute( - &maxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - device->get())); - - int regsPerThread{0}; - PI_CHECK_ERROR(cuFuncGetAttribute(®sPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get())); - - return blockSize * regsPerThread > size_t(maxRegsPerBlock); -} - -} // anonymous namespace - -/// ------ Error handling, matching OpenCL plugin semantics. -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { -namespace pi { - -// Report error and no return (keeps compiler from printing warnings). -// TODO: Probably change that to throw a catchable exception, -// but for now it is useful to see every failure. 
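// Editor's usage sketch (not part of the original source; the function name
// is invented for illustration): how getMaxRegistersJitOptionValue above is
// meant to behave. An options string containing "maxrregcount=32" yields 32
// and returns true; a string without that option returns false and leaves
// the output value untouched. Assumes <cassert>.
static void sketchMaxRegistersOption() {
  unsigned int maxRegs = 0;
  bool found = getMaxRegistersJitOptionValue("-foo maxrregcount=32", maxRegs);
  assert(found && maxRegs == 32u);
  assert(!getMaxRegistersJitOptionValue("-O3", maxRegs));
  (void)found;
  (void)maxRegs;
}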
-// -[[noreturn]] void die(const char *Message) { - std::cerr << "pi_die: " << Message << std::endl; - std::terminate(); -} - -// Reports error messages -void cuPrint(const char *Message) { - std::cerr << "pi_print: " << Message << std::endl; -} - -void assertion(bool Condition, const char *Message) { - if (!Condition) - die(Message); -} - -} // namespace pi -} // namespace detail -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl - -//-------------- -// PI object implementation - -extern "C" { - -// Required in a number of functions, so forward declare here -pi_result cuda_piEnqueueEventsWait(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event); -pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event); -pi_result cuda_piEventRelease(pi_event event); -pi_result cuda_piEventRetain(pi_event event); - -} // extern "C" - -/// \endcond - -void _pi_queue::compute_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i) { - if (barrier_event_ && !compute_applied_barrier_[stream_i]) { - PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - compute_applied_barrier_[stream_i] = true; - } -} - -void _pi_queue::transfer_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i) { - if (barrier_event_ && !transfer_applied_barrier_[stream_i]) { - PI_CHECK_ERROR(cuStreamWaitEvent(stream, barrier_event_, 0)); - transfer_applied_barrier_[stream_i] = true; - } -} - -CUstream _pi_queue::get_next_compute_stream(pi_uint32 *stream_token) { - pi_uint32 stream_i; - pi_uint32 token; - while (true) { - if (num_compute_streams_ < compute_streams_.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(compute_stream_mutex_); - // The second check is done after mutex is locked so other threads can not - // change num_compute_streams_ after that - if (num_compute_streams_ < compute_streams_.size()) { - PI_CHECK_ERROR( - cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_)); - } - } - token = compute_stream_idx_++; - stream_i = token % compute_streams_.size(); - // if a stream has been reused before it was next selected round-robin - // fashion, we want to delay its next use and instead select another one - // that is more likely to have completed all the enqueued work. 
- if (delay_compute_[stream_i]) { - delay_compute_[stream_i] = false; - } else { - break; - } - } - if (stream_token) { - *stream_token = token; - } - CUstream res = compute_streams_[stream_i]; - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; -} - -CUstream _pi_queue::get_next_compute_stream(pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - _pi_stream_guard &guard, - pi_uint32 *stream_token) { - for (pi_uint32 i = 0; i < num_events_in_wait_list; i++) { - pi_uint32 token = event_wait_list[i]->get_compute_stream_token(); - if (event_wait_list[i]->get_queue() == this && can_reuse_stream(token)) { - std::unique_lock compute_sync_guard( - compute_stream_sync_mutex_); - // redo the check after lock to avoid data races on - // last_sync_compute_streams_ - if (can_reuse_stream(token)) { - pi_uint32 stream_i = token % delay_compute_.size(); - delay_compute_[stream_i] = true; - if (stream_token) { - *stream_token = token; - } - guard = _pi_stream_guard{std::move(compute_sync_guard)}; - CUstream res = event_wait_list[i]->get_stream(); - compute_stream_wait_for_barrier_if_needed(res, stream_i); - return res; - } - } - } - guard = {}; - return get_next_compute_stream(stream_token); -} - -CUstream _pi_queue::get_next_transfer_stream() { - if (transfer_streams_.empty()) { // for example in in-order queue - return get_next_compute_stream(); - } - if (num_transfer_streams_ < transfer_streams_.size()) { - // the check above is for performance - so as not to lock mutex every time - std::lock_guard guard(transfer_stream_mutex_); - // The second check is done after mutex is locked so other threads can not - // change num_transfer_streams_ after that - if (num_transfer_streams_ < transfer_streams_.size()) { - PI_CHECK_ERROR( - cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_)); - } - } - pi_uint32 stream_i = transfer_stream_idx_++ % transfer_streams_.size(); - CUstream res = transfer_streams_[stream_i]; - transfer_stream_wait_for_barrier_if_needed(res, stream_i); - return res; -} - -_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, - CUstream stream, pi_uint32 stream_token) - : commandType_{type}, refCount_{1}, has_ownership_{true}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{stream_token}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { - - bool profilingEnabled = queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE; - - PI_CHECK_ERROR(cuEventCreate( - &evEnd_, profilingEnabled ? 
CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - - if (profilingEnabled) { - PI_CHECK_ERROR(cuEventCreate(&evQueued_, CU_EVENT_DEFAULT)); - PI_CHECK_ERROR(cuEventCreate(&evStart_, CU_EVENT_DEFAULT)); - } - - if (queue_ != nullptr) { - cuda_piQueueRetain(queue_); - } - cuda_piContextRetain(context_); -} - -_pi_event::_pi_event(pi_context context, CUevent eventNative) - : commandType_{PI_COMMAND_TYPE_USER}, refCount_{1}, has_ownership_{false}, - hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, - streamToken_{std::numeric_limits::max()}, evEnd_{eventNative}, - evStart_{nullptr}, evQueued_{nullptr}, queue_{nullptr}, - context_{context} { - cuda_piContextRetain(context_); -} - -_pi_event::~_pi_event() { - if (queue_ != nullptr) { - cuda_piQueueRelease(queue_); - } - cuda_piContextRelease(context_); -} - -pi_result _pi_event::start() { - assert(!is_started()); - pi_result result = PI_SUCCESS; - - try { - if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) { - // NOTE: This relies on the default stream to be unused. - result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0)); - result = PI_CHECK_ERROR(cuEventRecord(evStart_, stream_)); - } - } catch (pi_result error) { - result = error; - } - - isStarted_ = true; - return result; -} - -bool _pi_event::is_completed() const noexcept { - if (!isRecorded_) { - return false; - } - if (!hasBeenWaitedOn_) { - const CUresult ret = cuEventQuery(evEnd_); - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_READY) { - PI_CHECK_ERROR(ret); - return false; - } - if (ret == CUDA_ERROR_NOT_READY) { - return false; - } - } - return true; -} - -pi_uint64 _pi_device::get_elapsed_time(CUevent ev) const { - float miliSeconds = 0.0f; - - PI_CHECK_ERROR(cuEventElapsedTime(&miliSeconds, evBase_, ev)); - - return static_cast(miliSeconds * 1.0e6); -} - -pi_uint64 _pi_event::get_queued_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evQueued_); -} - -pi_uint64 _pi_event::get_start_time() const { - assert(is_started()); - return queue_->get_device()->get_elapsed_time(evStart_); -} - -pi_uint64 _pi_event::get_end_time() const { - assert(is_started() && is_recorded()); - return queue_->get_device()->get_elapsed_time(evEnd_); -} - -pi_result _pi_event::record() { - - if (is_recorded() || !is_started()) { - return PI_ERROR_INVALID_EVENT; - } - - pi_result result = PI_ERROR_INVALID_OPERATION; - - if (!queue_) { - return PI_ERROR_INVALID_QUEUE; - } - - try { - eventId_ = queue_->get_next_event_id(); - if (eventId_ == 0) { - sycl::detail::pi::die( - "Unrecoverable program state reached in event identifier overflow"); - } - result = PI_CHECK_ERROR(cuEventRecord(evEnd_, stream_)); - } catch (pi_result error) { - result = error; - } - - if (result == PI_SUCCESS) { - isRecorded_ = true; - } - - return result; -} - -pi_result _pi_event::wait() { - pi_result retErr; - try { - retErr = PI_CHECK_ERROR(cuEventSynchronize(evEnd_)); - hasBeenWaitedOn_ = true; - } catch (pi_result error) { - retErr = error; - } - - return retErr; -} - -pi_result _pi_event::release() { - if (!backend_has_ownership()) - return PI_SUCCESS; - - assert(queue_ != nullptr); - - PI_CHECK_ERROR(cuEventDestroy(evEnd_)); - - if (queue_->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE) { - PI_CHECK_ERROR(cuEventDestroy(evQueued_)); - PI_CHECK_ERROR(cuEventDestroy(evStart_)); - } - - return PI_SUCCESS; -} - -// makes all future work submitted to queue wait for all work captured in event. 
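// Editor's note on units (not part of the original source; the helper name
// is invented for illustration): the queued/start/end timestamps above are
// all measured with cuEventElapsedTime() against the per-device base event
// recorded at platform initialization, and that API reports a float number
// of milliseconds, while PI profiling info is expressed in nanoseconds. That
// is why get_elapsed_time() multiplies by 1.0e6.
static constexpr pi_uint64 sketchMillisecondsToNanoseconds(float milliseconds) {
  return static_cast<pi_uint64>(milliseconds * 1.0e6f);
}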
-pi_result enqueueEventWait(pi_queue queue, pi_event event) { - // for native events, the cuStreamWaitEvent call is used. - // This makes all future work submitted to stream wait for all - // work captured in event. - queue->for_each_stream([e = event->get()](CUstream s) { - PI_CHECK_ERROR(cuStreamWaitEvent(s, e, 0)); - }); - return PI_SUCCESS; -} - -_pi_program::_pi_program(pi_context ctxt) - : module_{nullptr}, binary_{}, binarySizeInBytes_{0}, refCount_{1}, - context_{ctxt}, kernelReqdWorkGroupSizeMD_{} { - cuda_piContextRetain(context_); -} - -_pi_program::~_pi_program() { cuda_piContextRelease(context_); } - -std::pair -splitMetadataName(const std::string &metadataName) { - size_t splitPos = metadataName.rfind('@'); - if (splitPos == std::string::npos) - return std::make_pair(metadataName, std::string{}); - return std::make_pair(metadataName.substr(0, splitPos), - metadataName.substr(splitPos, metadataName.length())); -} - -pi_result _pi_program::set_metadata(const pi_device_binary_property *metadata, - size_t length) { - for (size_t i = 0; i < length; ++i) { - const pi_device_binary_property metadataElement = metadata[i]; - std::string metadataElementName{metadataElement->Name}; - - auto [prefix, tag] = splitMetadataName(metadataElementName); - - if (tag == __SYCL_PI_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { - // If metadata is reqd_work_group_size, record it for the corresponding - // kernel name. - size_t MDElemsSize = metadataElement->ValSize - sizeof(std::uint64_t); - - // Expect between 1 and 3 32-bit integer values. - assert(MDElemsSize >= sizeof(std::uint32_t) && - MDElemsSize <= sizeof(std::uint32_t) * 3 && - "Unexpected size for reqd_work_group_size metadata"); - - // Get pointer to data, skipping 64-bit size at the start of the data. - const char *ValuePtr = - reinterpret_cast(metadataElement->ValAddr) + - sizeof(std::uint64_t); - // Read values and pad with 1's for values not present. 
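// Editor's note on the layout handled below (not part of the original
// source): each reqd_work_group_size property value starts with a 64-bit
// size field and is followed by one to three 32-bit dimensions, so
// MDElemsSize is 4, 8 or 12 bytes. "Pad with 1's" means the dimensions that
// are present are copied over a {1, 1, 1} default; for example, a
// two-dimensional requirement of {64, 4} is recorded as the tuple (64, 4, 1).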
- std::uint32_t reqdWorkGroupElements[] = {1, 1, 1}; - std::memcpy(reqdWorkGroupElements, ValuePtr, MDElemsSize); - kernelReqdWorkGroupSizeMD_[prefix] = - std::make_tuple(reqdWorkGroupElements[0], reqdWorkGroupElements[1], - reqdWorkGroupElements[2]); - } else if (tag == __SYCL_PI_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { - const char *metadataValPtr = - reinterpret_cast(metadataElement->ValAddr) + - sizeof(std::uint64_t); - const char *metadataValPtrEnd = - metadataValPtr + metadataElement->ValSize - sizeof(std::uint64_t); - globalIDMD_[prefix] = std::string{metadataValPtr, metadataValPtrEnd}; - } - } - return PI_SUCCESS; -} - -pi_result _pi_program::set_binary(const char *source, size_t length) { - assert((binary_ == nullptr && binarySizeInBytes_ == 0) && - "Re-setting program binary data which has already been set"); - binary_ = source; - binarySizeInBytes_ = length; - return PI_SUCCESS; -} - -pi_result _pi_program::build_program(const char *build_options) { - - this->buildOptions_ = build_options; - - constexpr const unsigned int numberOfOptions = 4u; - - std::vector options(numberOfOptions); - std::vector optionVals(numberOfOptions); - - // Pass a buffer for info messages - options[0] = CU_JIT_INFO_LOG_BUFFER; - optionVals[0] = (void *)infoLog_; - // Pass the size of the info buffer - options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; - optionVals[1] = (void *)(long)MAX_LOG_SIZE; - // Pass a buffer for error message - options[2] = CU_JIT_ERROR_LOG_BUFFER; - optionVals[2] = (void *)errorLog_; - // Pass the size of the error buffer - options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; - optionVals[3] = (void *)(long)MAX_LOG_SIZE; - - if (!buildOptions_.empty()) { - unsigned int maxRegs; - bool valid = getMaxRegistersJitOptionValue(buildOptions_, maxRegs); - if (valid) { - options.push_back(CU_JIT_MAX_REGISTERS); - optionVals.push_back(reinterpret_cast(maxRegs)); - } - } - - auto result = PI_CHECK_ERROR( - cuModuleLoadDataEx(&module_, static_cast(binary_), - options.size(), options.data(), optionVals.data())); - - const auto success = (result == PI_SUCCESS); - - buildStatus_ = - success ? PI_PROGRAM_BUILD_STATUS_SUCCESS : PI_PROGRAM_BUILD_STATUS_ERROR; - - // If no exception, result is correct - return success ? PI_SUCCESS : PI_ERROR_BUILD_PROGRAM_FAILURE; -} - -/// Finds kernel names by searching for entry points in the PTX source, as the -/// CUDA driver API doesn't expose an operation for this. -/// Note: This is currently only being used by the SYCL program class for the -/// has_kernel method, so an alternative would be to move the has_kernel -/// query to PI and use cuModuleGetFunction to check for a kernel. -/// Note: Another alternative is to add kernel names as metadata, like with -/// reqd_work_group_size. -std::string getKernelNames(pi_program) { - sycl::detail::pi::die("getKernelNames not implemented"); - return {}; -} - -//-- PI API implementation -extern "C" { - -pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret); - -/// Obtains the CUDA platform. -/// There is only one CUDA platform, and contains all devices on the system. -/// Triggers the CUDA Driver initialization (cuInit) the first time, so this -/// must be the first PI API called. -/// -/// However because multiple devices in a context is not currently supported, -/// place each device in a separate platform. 
-/// -pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms, - pi_uint32 *num_platforms) { - - try { - static std::once_flag initFlag; - static pi_uint32 numPlatforms = 1; - static std::vector<_pi_platform> platformIds; - - if (num_entries == 0 && platforms != nullptr) { - return PI_ERROR_INVALID_VALUE; - } - if (platforms == nullptr && num_platforms == nullptr) { - return PI_ERROR_INVALID_VALUE; - } - - pi_result err = PI_SUCCESS; - - std::call_once( - initFlag, - [](pi_result &err) { - if (cuInit(0) != CUDA_SUCCESS) { - numPlatforms = 0; - return; - } - int numDevices = 0; - err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices)); - if (numDevices == 0) { - numPlatforms = 0; - return; - } - try { - // make one platform per device - numPlatforms = numDevices; - platformIds.resize(numDevices); - - for (int i = 0; i < numDevices; ++i) { - CUdevice device; - err = PI_CHECK_ERROR(cuDeviceGet(&device, i)); - CUcontext context; - err = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(&context, device)); - - ScopedContext active(context); - CUevent evBase; - err = PI_CHECK_ERROR(cuEventCreate(&evBase, CU_EVENT_DEFAULT)); - - // Use default stream to record base event counter - err = PI_CHECK_ERROR(cuEventRecord(evBase, 0)); - - platformIds[i].devices_.emplace_back( - new _pi_device{device, context, evBase, &platformIds[i]}); - - { - const auto &dev = platformIds[i].devices_.back().get(); - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - pi_result retError = cuda_piDeviceGetInfo( - dev, PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES, - sizeof(maxThreadsPerBlock), maxThreadsPerBlock, nullptr); - assert(retError == PI_SUCCESS); - (void)retError; - - retError = cuda_piDeviceGetInfo( - dev, PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE, - sizeof(maxWorkGroupSize), &maxWorkGroupSize, nullptr); - assert(retError == PI_SUCCESS); - - dev->save_max_work_item_sizes(sizeof(maxThreadsPerBlock), - maxThreadsPerBlock); - dev->save_max_work_group_size(maxWorkGroupSize); - } - } - } catch (const std::bad_alloc &) { - // Signal out-of-memory situation - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); - err = PI_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - // Clear and rethrow to allow retry - for (int i = 0; i < numDevices; ++i) { - platformIds[i].devices_.clear(); - } - platformIds.clear(); - throw; - } - }, - err); - - if (num_platforms != nullptr) { - *num_platforms = numPlatforms; - } - - if (platforms != nullptr) { - for (unsigned i = 0; i < std::min(num_entries, numPlatforms); ++i) { - platforms[i] = &platformIds[i]; - } - } - - return err; - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -pi_result cuda_piPlatformGetInfo([[maybe_unused]] pi_platform platform, - pi_platform_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(platform != nullptr); - - switch (param_name) { - case PI_PLATFORM_INFO_NAME: - return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA CUDA BACKEND"); - case PI_PLATFORM_INFO_VENDOR: - return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA Corporation"); - case PI_PLATFORM_INFO_PROFILE: - return getInfo(param_value_size, param_value, param_value_size_ret, - "FULL PROFILE"); - case PI_PLATFORM_INFO_VERSION: { - auto version = getCudaVersionString(); - return getInfo(param_value_size, param_value, param_value_size_ret, - version.c_str()); - } - case PI_PLATFORM_INFO_EXTENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_EXT_PLATFORM_INFO_BACKEND: { - return getInfo(param_value_size, param_value, - param_value_size_ret, - PI_EXT_PLATFORM_BACKEND_CUDA); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Platform info request not implemented"); - return {}; -} - -/// \param devices List of devices available on the system -/// \param num_devices Number of elements in the list of devices -/// Requesting a non-GPU device triggers an error, all PI CUDA devices -/// are GPUs. -/// -pi_result cuda_piDevicesGet(pi_platform platform, pi_device_type device_type, - pi_uint32 num_entries, pi_device *devices, - pi_uint32 *num_devices) { - - pi_result err = PI_SUCCESS; - const bool askingForDefault = device_type == PI_DEVICE_TYPE_DEFAULT; - const bool askingForGPU = device_type & PI_DEVICE_TYPE_GPU; - const bool returnDevices = askingForDefault || askingForGPU; - - size_t numDevices = returnDevices ? platform->devices_.size() : 0; - - try { - if (num_devices) { - *num_devices = numDevices; - } - - if (returnDevices && devices) { - for (size_t i = 0; i < std::min(size_t(num_entries), numDevices); ++i) { - devices[i] = platform->devices_[i].get(); - } - } - - return err; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -/// \return PI_SUCCESS if the function is executed successfully -/// CUDA devices are always root devices so retain always returns success. -pi_result cuda_piDeviceRetain(pi_device) { return PI_SUCCESS; } - -pi_result cuda_piContextGetInfo(pi_context context, pi_context_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - switch (param_name) { - case PI_CONTEXT_INFO_NUM_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, 1); - case PI_CONTEXT_INFO_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_device()); - case PI_CONTEXT_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - context->get_reference_count()); - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: - case PI_EXT_CONTEXT_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // These queries should be dealt with in context_impl.cpp by calling the - // queries of each device separately and building the intersection set. 
- setErrorMessage("These queries should have never come here.", - PI_ERROR_INVALID_ARG_VALUE); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: - return getInfo(param_value_size, param_value, param_value_size_ret, - true); - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT: - case PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT: - // 2D USM operations currently not supported. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - - return PI_ERROR_OUT_OF_RESOURCES; -} - -pi_result cuda_piContextRetain(pi_context context) { - assert(context != nullptr); - assert(context->get_reference_count() > 0); - - context->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result cuda_piextContextSetExtendedDeleter( - pi_context context, pi_context_extended_deleter function, void *user_data) { - context->set_extended_deleter(function, user_data); - return PI_SUCCESS; -} - -/// Not applicable to CUDA, devices cannot be partitioned. -pi_result cuda_piDevicePartition(pi_device, - const pi_device_partition_property *, - pi_uint32, pi_device *, pi_uint32 *) { - return {}; -} - -/// \return If available, the first binary that is PTX -/// -pi_result cuda_piextDeviceSelectBinary(pi_device device, - pi_device_binary *binaries, - pi_uint32 num_binaries, - pi_uint32 *selected_binary) { - // Ignore unused parameter - (void)device; - - if (!binaries) { - sycl::detail::pi::die("No list of device images provided"); - } - if (num_binaries < 1) { - sycl::detail::pi::die("No binary images in the list"); - } - - // Look for an image for the NVPTX64 target, and return the first one that is - // found - for (pi_uint32 i = 0; i < num_binaries; i++) { - if (strcmp(binaries[i]->DeviceTargetSpec, - __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) { - *selected_binary = i; - return PI_SUCCESS; - } - } - - // No image can be loaded for the given device - return PI_ERROR_INVALID_BINARY; -} - -pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device, - pi_program program, - const char *func_name, - pi_uint64 *func_pointer_ret) { - // Check if device passed is the same the device bound to the context - assert(device == program->get_context()->get_device()); - assert(func_pointer_ret != nullptr); - - CUfunction func; - CUresult ret = cuModuleGetFunction(&func, program->get(), func_name); - *func_pointer_ret = reinterpret_cast(func); - pi_result retError = PI_SUCCESS; - - if (ret != CUDA_SUCCESS && ret != CUDA_ERROR_NOT_FOUND) - retError = PI_CHECK_ERROR(ret); - if (ret == CUDA_ERROR_NOT_FOUND) { - *func_pointer_ret = 0; - retError = PI_ERROR_INVALID_KERNEL_NAME; - } - - return retError; -} - -/// \return PI_SUCCESS always since CUDA devices are always root devices. 
-/// -pi_result cuda_piDeviceRelease(pi_device) { return PI_SUCCESS; } - -pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - static constexpr pi_uint32 max_work_item_dimensions = 3u; - - assert(device != nullptr); - - ScopedContext active(device->get_context()); - - switch (param_name) { - case PI_DEVICE_INFO_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_TYPE_GPU); - } - case PI_DEVICE_INFO_VENDOR_ID: { - return getInfo(param_value_size, param_value, param_value_size_ret, 4318u); - } - case PI_DEVICE_INFO_MAX_COMPUTE_UNITS: { - int compute_units = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&compute_units, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(compute_units >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(compute_units)); - } - case PI_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - max_work_item_dimensions); - } - case PI_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { - size_t return_sizes[max_work_item_dimensions]; - - int max_x = 0, max_y = 0, max_z = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_x >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_y >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_z >= 0); - - return_sizes[0] = size_t(max_x); - return_sizes[1] = size_t(max_y); - return_sizes[2] = size_t(max_z); - return getInfoArray(max_work_item_dimensions, param_value_size, param_value, - param_value_size_ret, return_sizes); - } - - case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: { - size_t return_sizes[max_work_item_dimensions]; - int max_x = 0, max_y = 0, max_z = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_x >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_y >= 0); - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(max_z >= 0); - - return_sizes[0] = size_t(max_x); - return_sizes[1] = size_t(max_y); - return_sizes[2] = size_t(max_z); - return getInfoArray(max_work_item_dimensions, param_value_size, param_value, - param_value_size_ret, return_sizes); - } - - case PI_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { - int max_work_group_size = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_work_group_size, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - - sycl::detail::pi::assertion(max_work_group_size >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_work_group_size)); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case 
PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - } - case PI_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_threads, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(maxWarps)); - } - case PI_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { - // Volta provides independent thread scheduling - // TODO: Revisit for previous generation GPUs - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - bool ifp = (major >= 7); - return getInfo(param_value_size, param_value, param_value_size_ret, ifp); - } - - case PI_DEVICE_INFO_ATOMIC_64: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - - bool atomic64 = (major >= 6) ? 
true : false; - return getInfo(param_value_size, param_value, param_value_size_ret, - atomic64); - } - case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { - pi_memory_order_capabilities capabilities = - PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE | - PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - pi_memory_order_capabilities capabilities = - (major >= 7) ? PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE | - PI_MEMORY_SCOPE_SYSTEM - : PI_MEMORY_SCOPE_WORK_ITEM | PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP | PI_MEMORY_SCOPE_DEVICE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence_order_capabilities. - pi_memory_order_capabilities capabilities = - PI_MEMORY_ORDER_RELAXED | PI_MEMORY_ORDER_ACQUIRE | - PI_MEMORY_ORDER_RELEASE | PI_MEMORY_ORDER_ACQ_REL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { - // SYCL2020 4.6.4.2 minimum mandated capabilities for - // atomic_fence/memory_scope_capabilities. - // Because scopes are hierarchical, wider scopes support all narrower - // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and - // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) - pi_memory_scope_capabilities capabilities = PI_MEMORY_SCOPE_WORK_ITEM | - PI_MEMORY_SCOPE_SUB_GROUP | - PI_MEMORY_SCOPE_WORK_GROUP; - return getInfo(param_value_size, param_value, param_value_size_ret, - capabilities); - } - case PI_EXT_ONEAPI_DEVICE_INFO_BFLOAT16_MATH_FUNCTIONS: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - - bool bfloat16 = (major >= 8) ? true : false; - return getInfo(param_value_size, param_value, param_value_size_ret, - bfloat16); - } - case PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // NVIDIA devices only support one sub-group size (the warp size) - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - size_t sizes[1] = {static_cast(warpSize)}; - return getInfoArray(1, param_value_size, param_value, - param_value_size_ret, sizes); - } - case PI_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { - int clock_freq = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&clock_freq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(clock_freq >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(clock_freq) / 1000u); - } - case PI_DEVICE_INFO_ADDRESS_BITS: { - auto bits = pi_uint32{std::numeric_limits::digits}; - return getInfo(param_value_size, param_value, param_value_size_ret, bits); - } - case PI_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { - // Max size of memory object allocation in bytes. 
- // The minimum value is max(min(1024 × 1024 × - // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), - // 32 × 1024 × 1024) for devices that are not of type - // CL_DEVICE_TYPE_CUSTOM. - - size_t global = 0; - sycl::detail::pi::assertion(cuDeviceTotalMem(&global, device->get()) == - CUDA_SUCCESS); - - auto quarter_global = static_cast(global / 4u); - - auto max_alloc = std::max(std::min(1024u * 1024u * 1024u, quarter_global), - 32u * 1024u * 1024u); - - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64{max_alloc}); - } - case PI_DEVICE_INFO_IMAGE_SUPPORT: { - pi_bool enabled = PI_FALSE; - - if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { - enabled = PI_TRUE; - } else { - sycl::detail::pi::cuPrint( - "Images are not fully supported by the CUDA BE, their support is " - "disabled by default. Their partial support can be activated by " - "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " - "runtime."); - } - - return getInfo(param_value_size, param_value, param_value_size_ret, - enabled); - } - case PI_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { - // This call doesn't match to CUDA as it doesn't have images, but instead - // surfaces and textures. No clear call in the CUDA API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { - // This call doesn't match to CUDA as it doesn't have images, but instead - // surfaces and textures. No clear call in the CUDA API to determine this, - // but some searching found as of SM 2.x 128 are supported. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. - int tex_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_height >= 0); - int surf_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_height >= 0); - - int min = std::min(tex_height, surf_height); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { - // Take the smaller of maximum surface and maximum texture height. 
- int tex_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_height >= 0); - int surf_height = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_height, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_height >= 0); - - int min = std::min(tex_height, surf_height); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { - // Take the smaller of maximum surface and maximum texture depth. - int tex_depth = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_depth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_depth >= 0); - int surf_depth = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_depth, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_depth >= 0); - - int min = std::min(tex_depth, surf_depth); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { - // Take the smaller of maximum surface and maximum texture width. - int tex_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&tex_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(tex_width >= 0); - int surf_width = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&surf_width, - CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(surf_width >= 0); - - int min = std::min(tex_width, surf_width); - - return getInfo(param_value_size, param_value, param_value_size_ret, min); - } - case PI_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(0)); - } - case PI_DEVICE_INFO_MAX_SAMPLERS: { - // This call is kind of meaningless for cuda, as samplers don't exist. - // Closest thing is textures, which is 128. - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_MAX_PARAMETER_SIZE: { - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters - // __global__ function parameters are passed to the device via constant - // memory and are limited to 4 KB. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{4000u}); - } - case PI_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { - int mem_base_addr_align = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&mem_base_addr_align, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - device->get()) == CUDA_SUCCESS); - // Multiply by 8 as clGetDeviceInfo returns this value in bits - mem_base_addr_align *= 8; - return getInfo(param_value_size, param_value, param_value_size_ret, - mem_base_addr_align); - } - case PI_DEVICE_INFO_HALF_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_SINGLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | - PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA | - PI_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT; - return getInfo(param_value_size, param_value, param_value_size_ret, config); - } - case PI_DEVICE_INFO_DOUBLE_FP_CONFIG: { - // TODO: is this config consistent across all NVIDIA GPUs? - auto config = PI_FP_DENORM | PI_FP_INF_NAN | PI_FP_ROUND_TO_NEAREST | - PI_FP_ROUND_TO_ZERO | PI_FP_ROUND_TO_INF | PI_FP_FMA; - return getInfo(param_value_size, param_value, param_value_size_ret, config); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { - // TODO: is this config consistent across all NVIDIA GPUs? - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { - // The value is documented for all existing GPUs in the CUDA programming - // guidelines, section "H.3.2. Global Memory". - return getInfo(param_value_size, param_value, param_value_size_ret, 128u); - } - case PI_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { - int cache_size = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&cache_size, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(cache_size >= 0); - // The L2 cache is global to the GPU. - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(cache_size)); - } - case PI_DEVICE_INFO_GLOBAL_MEM_SIZE: { - size_t bytes = 0; - // Runtime API has easy access to this value, driver API info is scarse. - sycl::detail::pi::assertion(cuDeviceTotalMem(&bytes, device->get()) == - CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64{bytes}); - } - case PI_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { - int constant_memory = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&constant_memory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(constant_memory >= 0); - - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(constant_memory)); - } - case PI_DEVICE_INFO_MAX_CONSTANT_ARGS: { - // TODO: is there a way to retrieve this from CUDA driver API? - // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX - // 1060 3GB - return getInfo(param_value_size, param_value, param_value_size_ret, 9u); - } - case PI_DEVICE_INFO_LOCAL_MEM_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_DEVICE_LOCAL_MEM_TYPE_LOCAL); - } - case PI_DEVICE_INFO_LOCAL_MEM_SIZE: { - // OpenCL's "local memory" maps most closely to CUDA's "shared memory". 
- // CUDA has its own definition of "local memory", which maps to OpenCL's - // "private memory". - int local_mem_size = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&local_mem_size, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(local_mem_size >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(local_mem_size)); - } - case PI_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { - int ecc_enabled = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&ecc_enabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, - device->get()) == CUDA_SUCCESS); - - sycl::detail::pi::assertion((ecc_enabled == 0) | (ecc_enabled == 1)); - auto result = static_cast(ecc_enabled); - return getInfo(param_value_size, param_value, param_value_size_ret, result); - } - case PI_DEVICE_INFO_HOST_UNIFIED_MEMORY: { - int is_integrated = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&is_integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, - device->get()) == CUDA_SUCCESS); - - sycl::detail::pi::assertion((is_integrated == 0) | (is_integrated == 1)); - auto result = static_cast(is_integrated); - return getInfo(param_value_size, param_value, param_value_size_ret, result); - } - case PI_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { - // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX - // 1060 3GB - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{1000u}); - } - case PI_DEVICE_INFO_ENDIAN_LITTLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_COMPILER_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_LINKER_AVAILABLE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_EXECUTION_CAPABILITIES: { - auto capability = PI_DEVICE_EXEC_CAPABILITIES_KERNEL; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { - // The mandated minimum capability: - auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE | - PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { - // The mandated minimum capability: - auto capability = PI_QUEUE_FLAG_PROFILING_ENABLE; - return getInfo(param_value_size, param_value, param_value_size_ret, - capability); - } - case PI_DEVICE_INFO_BUILT_IN_KERNELS: { - // An empty string is returned if no built-in kernels are supported by the - // device. 
- return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_DEVICE_INFO_PLATFORM: { - return getInfo(param_value_size, param_value, param_value_size_ret, - device->get_platform()); - } - case PI_DEVICE_INFO_NAME: { - static constexpr size_t MAX_DEVICE_NAME_LENGTH = 256u; - char name[MAX_DEVICE_NAME_LENGTH]; - sycl::detail::pi::assertion(cuDeviceGetName(name, MAX_DEVICE_NAME_LENGTH, - device->get()) == CUDA_SUCCESS); - return getInfoArray(strlen(name) + 1, param_value_size, param_value, - param_value_size_ret, name); - } - case PI_DEVICE_INFO_VENDOR: { - return getInfo(param_value_size, param_value, param_value_size_ret, - "NVIDIA Corporation"); - } - case PI_DEVICE_INFO_DRIVER_VERSION: { - auto version = getCudaVersionString(); - return getInfo(param_value_size, param_value, param_value_size_ret, - version.c_str()); - } - case PI_DEVICE_INFO_PROFILE: { - return getInfo(param_value_size, param_value, param_value_size_ret, "CUDA"); - } - case PI_DEVICE_INFO_REFERENCE_COUNT: { - return getInfo(param_value_size, param_value, param_value_size_ret, - device->get_reference_count()); - } - case PI_DEVICE_INFO_VERSION: { - std::stringstream s; - int major; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - s << major; - - int minor; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); - s << "." << minor; - return getInfo(param_value_size, param_value, param_value_size_ret, - s.str().c_str()); - } - case PI_DEVICE_INFO_OPENCL_C_VERSION: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - case PI_DEVICE_INFO_EXTENSIONS: { - - std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups "; - SupportedExtensions += PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT; - SupportedExtensions += " "; - - int major = 0; - int minor = 0; - - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); - - if ((major >= 6) || ((major == 5) && (minor >= 3))) { - SupportedExtensions += "cl_khr_fp16 "; - } - - return getInfo(param_value_size, param_value, param_value_size_ret, - SupportedExtensions.c_str()); - } - case PI_DEVICE_INFO_PRINTF_BUFFER_SIZE: { - // The minimum value for the FULL profile is 1 MB. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - size_t{1024u}); - } - case PI_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_TRUE); - } - case PI_DEVICE_INFO_PARENT_DEVICE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - nullptr); - } - case PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_PARTITION_PROPERTIES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(0u)); - } - case PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { - return getInfo(param_value_size, param_value, param_value_size_ret, 0u); - } - case PI_DEVICE_INFO_PARTITION_TYPE: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(0u)); - } - - // Intel USM extensions - - case PI_DEVICE_INFO_USM_HOST_SUPPORT: { - // from cl_intel_unified_shared_memory: "The host memory access capabilities - // apply to any host allocation." - // - // query if/how the device can access page-locked host memory, possibly - // through PCIe, using the same pointer as the host - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { - // the device shares a unified address space with the host - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; - } else { - // on GPU architectures with compute capability lower than 6.x, atomic - // operations from the GPU to CPU memory will not be atomic with respect - // to CPU initiated atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The device memory access capabilities apply to any device allocation - // associated with this device." - // - // query how the device can access memory allocated on the device itself (?) - pi_bitfield value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | - PI_USM_CONCURRENT_ATOMIC_ACCESS; - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The single device shared memory access capabilities apply to any shared - // allocation associated with this device." 
- // - // query if/how the device can access managed memory associated to it - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { - // the device can allocate managed memory on this system - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS; - } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - // the device can coherently access managed memory concurrently with the - // CPU - value |= PI_USM_CONCURRENT_ACCESS; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The cross-device shared memory access capabilities apply to any shared - // allocation associated with this device, or to any shared memory - // allocation on another device that also supports the same cross-device - // shared memory access capability." - // - // query if/how the device can access managed memory associated to other - // devices - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { - // the device can allocate managed memory on this system - value |= PI_USM_ACCESS; - } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { - // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS - // attribute can coherently access managed memory concurrently with the - // CPU - value |= PI_USM_CONCURRENT_ACCESS; - } - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= - 6) { - // compute capability 6.x introduces operations that are atomic with - // respect to other CPUs and GPUs in the system - if (value & PI_USM_ACCESS) - value |= PI_USM_ATOMIC_ACCESS; - if (value & PI_USM_CONCURRENT_ACCESS) - value |= PI_USM_CONCURRENT_ATOMIC_ACCESS; - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { - // from cl_intel_unified_shared_memory: - // "The shared system memory access capabilities apply to any allocations - // made by a system allocator, such as malloc or new." 
- // - // query if/how the device can access pageable host memory allocated by the - // system allocator - pi_bitfield value = {}; - if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { - // the device suppports coherently accessing pageable memory without - // calling cuMemHostRegister/cudaHostRegister on it - if (getAttribute(device, - CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) { - // the link between the device and the host supports native atomic - // operations - value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS | - PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS; - } else { - // the link between the device and the host does not support native - // atomic operations - value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS; - } - } - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_EXT_ONEAPI_DEVICE_INFO_CUDA_ASYNC_BARRIER: { - int value = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8; - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_DEVICE_INFO_BACKEND_VERSION: { - int major = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); - int minor = - getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); - std::string result = std::to_string(major) + "." + std::to_string(minor); - return getInfo(param_value_size, param_value, param_value_size_ret, - result.c_str()); - } - - case PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY: { - size_t FreeMemory = 0; - size_t TotalMemory = 0; - sycl::detail::pi::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == - CUDA_SUCCESS, - "failed cuMemGetInfo() API."); - return getInfo(param_value_size, param_value, param_value_size_ret, - FreeMemory); - } - case PI_EXT_INTEL_DEVICE_INFO_MEMORY_CLOCK_RATE: { - int value = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(value >= 0); - // Convert kilohertz to megahertz when returning. 
- return getInfo(param_value_size, param_value, param_value_size_ret, - value / 1000); - } - case PI_EXT_INTEL_DEVICE_INFO_MEMORY_BUS_WIDTH: { - int value = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(value >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - case PI_EXT_INTEL_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_int32{1}); - } - - case PI_DEVICE_INFO_DEVICE_ID: { - int value = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion(value >= 0); - return getInfo(param_value_size, param_value, param_value_size_ret, value); - } - - case PI_DEVICE_INFO_UUID: { - CUuuid uuid; -#if (CUDA_VERSION >= 11040) - sycl::detail::pi::assertion(cuDeviceGetUuid_v2(&uuid, device->get()) == - CUDA_SUCCESS); -#else - sycl::detail::pi::assertion(cuDeviceGetUuid(&uuid, device->get()) == - CUDA_SUCCESS); -#endif - std::array name; - std::copy(uuid.bytes, uuid.bytes + 16, name.begin()); - return getInfoArray(16, param_value_size, param_value, param_value_size_ret, - name.data()); - } - - case PI_DEVICE_INFO_MAX_MEM_BANDWIDTH: { - int major = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device->get()) == CUDA_SUCCESS); - - int minor = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, - device->get()) == CUDA_SUCCESS); - - // Some specific devices seem to need special handling. See reference - // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu - bool is_xavier_agx = major == 7 && minor == 2; - bool is_orin_agx = major == 8 && minor == 7; - - int memory_clock_khz = 0; - if (is_xavier_agx) { - memory_clock_khz = 2133000; - } else if (is_orin_agx) { - memory_clock_khz = 3200000; - } else { - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&memory_clock_khz, - CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, - device->get()) == CUDA_SUCCESS); - } - - int memory_bus_width = 0; - if (is_orin_agx) { - memory_bus_width = 256; - } else { - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&memory_bus_width, - CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, - device->get()) == CUDA_SUCCESS); - } - - uint64_t memory_bandwidth = - uint64_t(memory_clock_khz) * memory_bus_width * 250; - - return getInfo(param_value_size, param_value, param_value_size_ret, - memory_bandwidth); - } - case PI_EXT_INTEL_DEVICE_INFO_MEM_CHANNEL_SUPPORT: { - // The mem-channel buffer property is not supported on CUDA devices. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - } - case PI_DEVICE_INFO_IMAGE_SRGB: { - // The sRGB images are not supported on CUDA. - return getInfo(param_value_size, param_value, param_value_size_ret, - false); - } - - case PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { - // Maximum number of 32-bit registers available to a thread block. - // Note: This number is shared by all thread blocks simultaneously resident - // on a multiprocessor. 
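// [Editor's note, not part of the patch.] The constant 250 in the
// PI_DEVICE_INFO_MAX_MEM_BANDWIDTH computation above is the usual
// "kHz memory clock * bus width in bits -> bytes per second" conversion for
// DDR memory: 1000 (kHz to Hz) * 2 (two transfers per clock) / 8 (bits to
// bytes) = 250. For example, the hard-coded Orin AGX numbers give
// 3200000 kHz * 256 bit * 250 = 204.8e9 bytes/s, i.e. ~204.8 GB/s, matching
// the commonly quoted peak bandwidth for that part.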
- int max_registers{-1};
- PI_CHECK_ERROR(cuDeviceGetAttribute(
- &max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
- device->get()));
-
- sycl::detail::pi::assertion(max_registers >= 0);
-
- return getInfo(param_value_size, param_value, param_value_size_ret,
- static_cast<uint32_t>(max_registers));
- }
-
- case PI_DEVICE_INFO_PCI_ADDRESS: {
- constexpr size_t AddressBufferSize = 13;
- char AddressBuffer[AddressBufferSize];
- sycl::detail::pi::assertion(
- cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, device->get()) ==
- CUDA_SUCCESS);
- // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written
- sycl::detail::pi::assertion(strnlen(AddressBuffer, AddressBufferSize) ==
- 12);
- return getInfoArray(strnlen(AddressBuffer, AddressBufferSize - 1) + 1,
- param_value_size, param_value, param_value_size_ret,
- AddressBuffer);
- }
- // TODO: Investigate if this information is available on CUDA.
- case PI_DEVICE_INFO_GPU_EU_COUNT:
- case PI_DEVICE_INFO_GPU_EU_SIMD_WIDTH:
- case PI_DEVICE_INFO_GPU_SLICES:
- case PI_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE:
- case PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE:
- case PI_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
- return PI_ERROR_INVALID_VALUE;
-
- default:
- __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
- }
- sycl::detail::pi::die("Device info request not implemented");
- return {};
-}
-
-/// Gets the native CUDA handle of a PI device object
-///
-/// \param[in] device The PI device to get the native CUDA object of.
-/// \param[out] nativeHandle Set to the native handle of the PI device object.
-///
-/// \return PI_SUCCESS
-pi_result cuda_piextDeviceGetNativeHandle(pi_device device,
- pi_native_handle *nativeHandle) {
- *nativeHandle = static_cast<pi_native_handle>(device->get());
- return PI_SUCCESS;
-}
-
-/// Created a PI device object from a CUDA device handle.
-/// NOTE: The created PI object does not take ownership of the native handle.
-///
-/// \param[in] nativeHandle The native handle to create PI device object from.
-/// \param[in] platform is the PI platform of the device.
-/// \param[out] device Set to the PI device object created from native handle.
-/// -/// \return TBD -pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_platform platform, - pi_device *piDevice) { - assert(piDevice != nullptr); - - CUdevice cu_device = static_cast(nativeHandle); - - auto is_device = [=](std::unique_ptr<_pi_device> &dev) { - return dev->get() == cu_device; - }; - - // If a platform is provided just check if the device is in it - if (platform) { - auto search_res = std::find_if(begin(platform->devices_), - end(platform->devices_), is_device); - if (search_res != end(platform->devices_)) { - *piDevice = (*search_res).get(); - return PI_SUCCESS; - } - } - - // Get list of platforms - pi_uint32 num_platforms; - pi_result result = cuda_piPlatformsGet(0, nullptr, &num_platforms); - if (result != PI_SUCCESS) - return result; - - pi_platform *plat = - static_cast(malloc(num_platforms * sizeof(pi_platform))); - result = cuda_piPlatformsGet(num_platforms, plat, nullptr); - if (result != PI_SUCCESS) - return result; - - // Iterate through platforms to find device that matches nativeHandle - for (pi_uint32 j = 0; j < num_platforms; ++j) { - auto search_res = std::find_if(begin(plat[j]->devices_), - end(plat[j]->devices_), is_device); - if (search_res != end(plat[j]->devices_)) { - *piDevice = (*search_res).get(); - return PI_SUCCESS; - } - } - - // If the provided nativeHandle cannot be matched to an - // existing device return error - return PI_ERROR_INVALID_OPERATION; -} - -/* Context APIs */ - -/// Create a PI CUDA context. -/// -/// By default creates a scoped context and keeps the last active CUDA context -/// on top of the CUDA context stack. -/// With the __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of -/// PI_TRUE creates a primary CUDA context and activates it on the CUDA context -/// stack. -/// -/// \param[in] properties 0 terminated array of key/id-value combinations. Can -/// be nullptr. Only accepts property key/id -/// __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY with a pi_bool value. -/// \param[in] num_devices Number of devices to create the context for. -/// \param[in] devices Devices to create the context for. -/// \param[in] pfn_notify Callback, currently unused. -/// \param[in] user_data User data for callback. -/// \param[out] retcontext Set to created context on success. -/// -/// \return PI_SUCCESS on success, otherwise an error return code. -pi_result cuda_piContextCreate( - [[maybe_unused]] const pi_context_properties *properties, - [[maybe_unused]] pi_uint32 num_devices, const pi_device *devices, - [[maybe_unused]] void (*pfn_notify)(const char *errinfo, - const void *private_info, size_t cb, - void *user_data), - [[maybe_unused]] void *user_data, pi_context *retcontext) { - - assert(devices != nullptr); - // TODO: How to implement context callback? - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - assert(num_devices == 1); - // Need input context - assert(retcontext != nullptr); - pi_result errcode_ret = PI_SUCCESS; - - std::unique_ptr<_pi_context> piContextPtr{nullptr}; - try { - piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{*devices}); - *retcontext = piContextPtr.release(); - } catch (pi_result err) { - errcode_ret = err; - } catch (...) 
{
- errcode_ret = PI_ERROR_OUT_OF_RESOURCES;
- }
- return errcode_ret;
-}
-
-pi_result cuda_piContextRelease(pi_context ctxt) {
- assert(ctxt != nullptr);
-
- if (ctxt->decrement_reference_count() > 0) {
- return PI_SUCCESS;
- }
- ctxt->invoke_extended_deleters();
-
- std::unique_ptr<_pi_context> context{ctxt};
-
- return PI_SUCCESS;
-}
-
-/// Gets the native CUDA handle of a PI context object
-///
-/// \param[in] context The PI context to get the native CUDA object of.
-/// \param[out] nativeHandle Set to the native handle of the PI context object.
-///
-/// \return PI_SUCCESS
-pi_result cuda_piextContextGetNativeHandle(pi_context context,
- pi_native_handle *nativeHandle) {
- *nativeHandle = reinterpret_cast<pi_native_handle>(context->get());
- return PI_SUCCESS;
-}
-
-/// Created a PI context object from a CUDA context handle.
-/// NOTE: The created PI object does not take ownership of the native handle.
-///
-/// \param[in] nativeHandle The native handle to create PI context object from.
-/// \param[out] context Set to the PI context object created from native handle.
-///
-/// \return TBD
-pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle,
- pi_uint32 num_devices,
- const pi_device *devices,
- bool ownNativeHandle,
- pi_context *piContext) {
- (void)nativeHandle;
- (void)num_devices;
- (void)devices;
- (void)ownNativeHandle;
- (void)piContext;
- assert(piContext != nullptr);
- assert(ownNativeHandle == false);
-
- return PI_ERROR_INVALID_OPERATION;
-}
-
-/// Creates a PI Memory object using a CUDA memory allocation.
-/// Can trigger a manual copy depending on the mode.
-/// \TODO Implement USE_HOST_PTR using cuHostRegister
-///
-pi_result
-cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size,
- void *host_ptr, pi_mem *ret_mem,
- [[maybe_unused]] const pi_mem_properties *properties) {
- // Need input memory object
- assert(ret_mem != nullptr);
- assert((properties == nullptr || *properties == 0) &&
- "no mem properties goes to cuda RT yet");
- // Currently, USE_HOST_PTR is not implemented using host register
- // since this triggers a weird segfault after program ends.
- // Setting this constant to true enables testing that behavior.
- const bool enableUseHostPtr = false; - const bool performInitialCopy = - (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || - ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && !enableUseHostPtr); - pi_result retErr = PI_SUCCESS; - pi_mem retMemObj = nullptr; - - try { - ScopedContext active(context); - CUdeviceptr ptr; - _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = - _pi_mem::mem_::buffer_mem_::alloc_mode::classic; - - if ((flags & PI_MEM_FLAGS_HOST_PTR_USE) && enableUseHostPtr) { - retErr = PI_CHECK_ERROR( - cuMemHostRegister(host_ptr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); - retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0)); - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr; - } else if (flags & PI_MEM_FLAGS_HOST_PTR_ALLOC) { - retErr = PI_CHECK_ERROR(cuMemAllocHost(&host_ptr, size)); - retErr = PI_CHECK_ERROR(cuMemHostGetDevicePointer(&ptr, host_ptr, 0)); - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - } else { - retErr = PI_CHECK_ERROR(cuMemAlloc(&ptr, size)); - if (flags & PI_MEM_FLAGS_HOST_PTR_COPY) { - allocMode = _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in; - } - } - - if (retErr == PI_SUCCESS) { - pi_mem parentBuffer = nullptr; - - auto piMemObj = std::unique_ptr<_pi_mem>( - new _pi_mem{context, parentBuffer, allocMode, ptr, host_ptr, size}); - if (piMemObj != nullptr) { - retMemObj = piMemObj.release(); - if (performInitialCopy) { - // Operates on the default stream of the current CUDA context. - retErr = PI_CHECK_ERROR(cuMemcpyHtoD(ptr, host_ptr, size)); - // Synchronize with default stream implicitly used by cuMemcpyHtoD - // to make buffer data available on device before any other PI call - // uses it. - if (retErr == PI_SUCCESS) { - CUstream defaultStream = 0; - retErr = PI_CHECK_ERROR(cuStreamSynchronize(defaultStream)); - } - } - } else { - retErr = PI_ERROR_OUT_OF_HOST_MEMORY; - } - } - } catch (pi_result err) { - retErr = err; - } catch (...) { - retErr = PI_ERROR_OUT_OF_RESOURCES; - } - - *ret_mem = retMemObj; - - return retErr; -} - -/// Decreases the reference count of the Mem object. 
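// [Editor's illustration, not part of the patch.] The copy-in path of
// cuda_piMemBufferCreate above reduces to the Driver API sequence below; the
// explicit cuStreamSynchronize mirrors the comment about making the data
// visible before any later PI call touches the buffer. The helper name is
// ours and error handling is minimal:
#include <cuda.h>
#include <cstddef>

inline CUresult allocAndCopyIn(CUdeviceptr &Ptr, const void *Host,
                               size_t Size) {
  CUresult Err = cuMemAlloc(&Ptr, Size); // device-side allocation
  if (Err != CUDA_SUCCESS)
    return Err;
  Err = cuMemcpyHtoD(Ptr, Host, Size);   // copy on the default stream
  if (Err != CUDA_SUCCESS)
    return Err;
  // Make the copy visible before anything else uses the allocation.
  return cuStreamSynchronize(/*default stream*/ 0);
}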
-/// If this is zero, calls the relevant CUDA Free function -/// \return PI_SUCCESS unless deallocation error -/// -pi_result cuda_piMemRelease(pi_mem memObj) { - assert((memObj != nullptr) && "PI_ERROR_INVALID_MEM_OBJECTS"); - - pi_result ret = PI_SUCCESS; - - try { - - // Do nothing if there are other references - if (memObj->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - - // make sure memObj is released in case PI_CHECK_ERROR throws - std::unique_ptr<_pi_mem> uniqueMemObj(memObj); - - if (memObj->is_sub_buffer()) { - return PI_SUCCESS; - } - - ScopedContext active(uniqueMemObj->get_context()); - - if (memObj->mem_type_ == _pi_mem::mem_type::buffer) { - switch (uniqueMemObj->mem_.buffer_mem_.allocMode_) { - case _pi_mem::mem_::buffer_mem_::alloc_mode::copy_in: - case _pi_mem::mem_::buffer_mem_::alloc_mode::classic: - ret = PI_CHECK_ERROR(cuMemFree(uniqueMemObj->mem_.buffer_mem_.ptr_)); - break; - case _pi_mem::mem_::buffer_mem_::alloc_mode::use_host_ptr: - ret = PI_CHECK_ERROR( - cuMemHostUnregister(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); - break; - case _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr: - ret = PI_CHECK_ERROR( - cuMemFreeHost(uniqueMemObj->mem_.buffer_mem_.hostPtr_)); - }; - } else if (memObj->mem_type_ == _pi_mem::mem_type::surface) { - ret = PI_CHECK_ERROR( - cuSurfObjectDestroy(uniqueMemObj->mem_.surface_mem_.get_surface())); - ret = PI_CHECK_ERROR( - cuArrayDestroy(uniqueMemObj->mem_.surface_mem_.get_array())); - } - - } catch (pi_result err) { - ret = err; - } catch (...) { - ret = PI_ERROR_OUT_OF_RESOURCES; - } - - if (ret != PI_SUCCESS) { - // A reported CUDA error is either an implementation or an asynchronous CUDA - // error for which it is unclear if the function that reported it succeeded - // or not. Either way, the state of the program is compromised and likely - // unrecoverable. - sycl::detail::pi::die( - "Unrecoverable program state reached in cuda_piMemRelease"); - } - - return PI_SUCCESS; -} - -/// Implements a buffer partition in the CUDA backend. -/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented -/// as an offset over an existing CUDA allocation. -/// -pi_result cuda_piMemBufferPartition( - pi_mem parent_buffer, pi_mem_flags flags, - [[maybe_unused]] pi_buffer_create_type buffer_create_type, - void *buffer_create_info, pi_mem *memObj) { - assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT"); - assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS"); - assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT"); - - // Default value for flags means PI_MEM_FLAGS_ACCCESS_RW. - if (flags == 0) { - flags = PI_MEM_FLAGS_ACCESS_RW; - } - - assert((flags == PI_MEM_FLAGS_ACCESS_RW) && "PI_ERROR_INVALID_VALUE"); - assert((buffer_create_type == PI_BUFFER_CREATE_TYPE_REGION) && - "PI_ERROR_INVALID_VALUE"); - assert((buffer_create_info != nullptr) && "PI_ERROR_INVALID_VALUE"); - assert(memObj != nullptr); - - const auto bufferRegion = - *reinterpret_cast(buffer_create_info); - assert((bufferRegion.size != 0u) && "PI_ERROR_INVALID_BUFFER_SIZE"); - - assert((bufferRegion.origin <= (bufferRegion.origin + bufferRegion.size)) && - "Overflow"); - assert(((bufferRegion.origin + bufferRegion.size) <= - parent_buffer->mem_.buffer_mem_.get_size()) && - "PI_ERROR_INVALID_BUFFER_SIZE"); - // Retained indirectly due to retaining parent buffer below. 
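// [Editor's note, not part of the patch.] A sub-buffer in this backend is
// just the parent allocation's device pointer plus the region origin, so the
// asserts above amount to an overflow-safe bounds check. A small sketch (the
// helper name is ours):
#include <cstddef>

inline bool subBufferInBounds(size_t Origin, size_t Size, size_t ParentSize) {
  // First clause rejects wrap-around of Origin + Size, second keeps the end
  // of the region inside the parent buffer.
  return Origin <= Origin + Size && Origin + Size <= ParentSize;
}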
- pi_context context = parent_buffer->context_; - _pi_mem::mem_::buffer_mem_::alloc_mode allocMode = - _pi_mem::mem_::buffer_mem_::alloc_mode::classic; - - assert(parent_buffer->mem_.buffer_mem_.ptr_ != - _pi_mem::mem_::buffer_mem_::native_type{0}); - _pi_mem::mem_::buffer_mem_::native_type ptr = - parent_buffer->mem_.buffer_mem_.ptr_ + bufferRegion.origin; - - void *hostPtr = nullptr; - if (parent_buffer->mem_.buffer_mem_.hostPtr_) { - hostPtr = static_cast(parent_buffer->mem_.buffer_mem_.hostPtr_) + - bufferRegion.origin; - } - - std::unique_ptr<_pi_mem> retMemObj{nullptr}; - try { - retMemObj = std::unique_ptr<_pi_mem>{new _pi_mem{ - context, parent_buffer, allocMode, ptr, hostPtr, bufferRegion.size}}; - } catch (pi_result err) { - *memObj = nullptr; - return err; - } catch (...) { - *memObj = nullptr; - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *memObj = retMemObj.release(); - return PI_SUCCESS; -} - -pi_result cuda_piMemGetInfo(pi_mem, pi_mem_info, size_t, void *, size_t *) { - sycl::detail::pi::die("cuda_piMemGetInfo not implemented"); -} - -/// Gets the native CUDA handle of a PI mem object -/// -/// \param[in] mem The PI mem to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI mem object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextMemGetNativeHandle(pi_mem mem, - pi_native_handle *nativeHandle) { - *nativeHandle = static_cast(mem->mem_.buffer_mem_.get()); - return PI_SUCCESS; -} - -/// Created a PI mem object from a CUDA mem handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI mem object from. -/// \param[in] context The PI context of the memory allocation. -/// \param[in] ownNativeHandle Indicates if we own the native memory handle or -/// it came from interop that asked to not transfer the ownership to SYCL RT. -/// \param[out] mem Set to the PI mem object created from native handle. -/// -/// \return TBD -pi_result cuda_piextMemCreateWithNativeHandle(pi_native_handle, pi_context, - bool, pi_mem *) { - sycl::detail::pi::die( - "Creation of PI mem from native handle not implemented"); - return {}; -} - -/// Created a PI image mem object from a CUDA image mem handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] pi_native_handle The native handle to create PI mem object from. -/// \param[in] pi_context The PI context of the memory allocation. -/// \param[in] ownNativeHandle Boolean indicates if we own the native memory -/// handle or it came from interop that asked to not transfer the ownership to -/// SYCL RT. \param[in] pi_image_format The format of the image. \param[in] -/// pi_image_desc The description information for the image. \param[out] pi_mem -/// Set to the PI mem object created from native handle. -/// -/// \return TBD -pi_result cuda_piextMemImageCreateWithNativeHandle(pi_native_handle, pi_context, - bool, - const pi_image_format *, - const pi_image_desc *, - pi_mem *) { - sycl::detail::pi::die( - "Creation of PI mem from native image handle not implemented"); - return {}; -} - -/// Creates a `pi_queue` object on the CUDA backend. 
-/// Valid properties -/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT -/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING -/// \return Pi queue object mapping to a CUStream -/// -pi_result cuda_piQueueCreate(pi_context context, pi_device device, - pi_queue_properties properties, pi_queue *queue) { - try { - std::unique_ptr<_pi_queue> queueImpl{nullptr}; - - if (context->get_device() != device) { - *queue = nullptr; - return PI_ERROR_INVALID_DEVICE; - } - - unsigned int flags = 0; - if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) { - flags = CU_STREAM_DEFAULT; - } else if (properties == __SYCL_PI_CUDA_SYNC_WITH_DEFAULT) { - flags = 0; - } else { - flags = CU_STREAM_NON_BLOCKING; - } - - const bool is_out_of_order = - properties & PI_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - - std::vector computeCuStreams( - is_out_of_order ? _pi_queue::default_num_compute_streams : 1); - std::vector transferCuStreams( - is_out_of_order ? _pi_queue::default_num_transfer_streams : 0); - - queueImpl = std::unique_ptr<_pi_queue>( - new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), - context, device, properties, flags}); - - *queue = queueImpl.release(); - - return PI_SUCCESS; - } catch (pi_result err) { - - return err; - - } catch (...) { - - return PI_ERROR_OUT_OF_RESOURCES; - } -} -pi_result cuda_piextQueueCreate(pi_context Context, pi_device Device, - pi_queue_properties *Properties, - pi_queue *Queue) { - assert(Properties); - // Expect flags mask to be passed first. - assert(Properties[0] == PI_QUEUE_FLAGS); - if (Properties[0] != PI_QUEUE_FLAGS) - return PI_ERROR_INVALID_VALUE; - pi_queue_properties Flags = Properties[1]; - // Extra data isn't supported yet. - assert(Properties[2] == 0); - if (Properties[2] != 0) - return PI_ERROR_INVALID_VALUE; - return cuda_piQueueCreate(Context, Device, Flags, Queue); -} - -pi_result cuda_piQueueGetInfo(pi_queue command_queue, pi_queue_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(command_queue != nullptr); - - switch (param_name) { - case PI_QUEUE_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->context_); - case PI_QUEUE_INFO_DEVICE: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->device_); - case PI_QUEUE_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->get_reference_count()); - case PI_QUEUE_INFO_PROPERTIES: - return getInfo(param_value_size, param_value, param_value_size_ret, - command_queue->properties_); - case PI_EXT_ONEAPI_QUEUE_INFO_EMPTY: { - try { - bool IsReady = command_queue->all_of([](CUstream s) -> bool { - const CUresult ret = cuStreamQuery(s); - if (ret == CUDA_SUCCESS) - return true; - - if (ret == CUDA_ERROR_NOT_READY) - return false; - - PI_CHECK_ERROR(ret); - return false; - }); - return getInfo(param_value_size, param_value, param_value_size_ret, - IsReady); - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_OUT_OF_RESOURCES; - } - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Queue info request not implemented"); - return {}; -} - -pi_result cuda_piQueueRetain(pi_queue command_queue) { - assert(command_queue != nullptr); - assert(command_queue->get_reference_count() > 0); - - command_queue->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result cuda_piQueueRelease(pi_queue command_queue) { - assert(command_queue != nullptr); - - if (command_queue->decrement_reference_count() > 0) { - return PI_SUCCESS; - } - - try { - std::unique_ptr<_pi_queue> queueImpl(command_queue); - - if (!command_queue->backend_has_ownership()) - return PI_SUCCESS; - - ScopedContext active(command_queue->get_context()); - - command_queue->for_each_stream([](CUstream s) { - PI_CHECK_ERROR(cuStreamSynchronize(s)); - PI_CHECK_ERROR(cuStreamDestroy(s)); - }); - - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -pi_result cuda_piQueueFinish(pi_queue command_queue) { - pi_result result = PI_SUCCESS; - - try { - - assert(command_queue != - nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code - ScopedContext active(command_queue->get_context()); - - command_queue->sync_streams([&result](CUstream s) { - result = PI_CHECK_ERROR(cuStreamSynchronize(s)); - }); - - } catch (pi_result err) { - - result = err; - - } catch (...) { - - result = PI_ERROR_OUT_OF_RESOURCES; - } - - return result; -} - -// There is no CUDA counterpart for queue flushing and we don't run into the -// same problem of having to flush cross-queue dependencies as some of the -// other plugins, so it can be left as no-op. -pi_result cuda_piQueueFlush(pi_queue command_queue) { - (void)command_queue; - return PI_SUCCESS; -} - -/// Gets the native CUDA handle of a PI queue object -/// -/// \param[in] queue The PI queue to get the native CUDA object of. -/// \param[in] NativeHandleDesc Pointer to additional native handle info. -/// \param[out] nativeHandle Set to the native handle of the PI queue object. -/// -/// \return PI_SUCCESS -pi_result cuda_piextQueueGetNativeHandle(pi_queue queue, - pi_native_handle *nativeHandle, - int32_t *NativeHandleDesc) { - *NativeHandleDesc = 0; - ScopedContext active(queue->get_context()); - *nativeHandle = - reinterpret_cast(queue->get_next_compute_stream()); - return PI_SUCCESS; -} - -/// Created a PI queue object from a CUDA queue handle. -/// NOTE: The created PI object does not take ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI queue object from. -/// \param[in] nativeHandleDesc Info about the native handle. -/// \param[in] context is the PI context of the queue. -/// \param[out] queue Set to the PI queue object created from native handle. -/// \param ownNativeHandle tells if SYCL RT should assume the ownership of -/// the native handle, if it can. 
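// [Editor's illustration, not part of the patch.] The queue <-> CUstream
// mapping above, and the native-handle import just below, hinge on the stream
// creation flags: CU_STREAM_DEFAULT keeps legacy default-stream
// synchronization, CU_STREAM_NON_BLOCKING opts out of it. A minimal sketch
// (helper name is ours):
#include <cuda.h>

inline CUresult makeNonBlockingStream(CUstream &Stream) {
  // Create a stream that does not synchronize with the legacy default stream.
  CUresult Err = cuStreamCreate(&Stream, CU_STREAM_NON_BLOCKING);
  if (Err != CUDA_SUCCESS)
    return Err;
  // Reading the flags back is what the native-handle import path does to
  // decide which PI queue properties to report.
  unsigned Flags = 0;
  return cuStreamGetFlags(Stream, &Flags);
}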
-/// -/// \return TBD -pi_result cuda_piextQueueCreateWithNativeHandle( - pi_native_handle nativeHandle, int32_t NativeHandleDesc, pi_context context, - pi_device device, bool ownNativeHandle, pi_queue_properties *Properties, - pi_queue *queue) { - (void)NativeHandleDesc; - (void)device; - (void)ownNativeHandle; - (void)Properties; - assert(ownNativeHandle == false); - - unsigned int flags; - CUstream cuStream = reinterpret_cast(nativeHandle); - - auto retErr = PI_CHECK_ERROR(cuStreamGetFlags(cuStream, &flags)); - - pi_queue_properties properties = 0; - if (flags == CU_STREAM_DEFAULT) - properties = __SYCL_PI_CUDA_USE_DEFAULT_STREAM; - else if (flags == CU_STREAM_NON_BLOCKING) - properties = __SYCL_PI_CUDA_SYNC_WITH_DEFAULT; - else - sycl::detail::pi::die("Unknown cuda stream"); - - std::vector computeCuStreams(1, cuStream); - std::vector transferCuStreams(0); - - // Create queue and set num_compute_streams to 1, as computeCuStreams has - // valid stream - *queue = new _pi_queue{std::move(computeCuStreams), - std::move(transferCuStreams), - context, - context->get_device(), - properties, - flags, - /*backend_owns*/ false}; - (*queue)->num_compute_streams_ = 1; - - return retErr; -} - -pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_write, size_t offset, - size_t size, const void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = - PI_CHECK_ERROR(cuMemcpyHtoDAsync(devPtr + offset, ptr, size, cuStream)); - - if (event) { - retErr = retImplEv->record(); - } - - if (blocking_write) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_read, size_t offset, - size_t size, void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = - PI_CHECK_ERROR(cuMemcpyDtoHAsync(ptr, devPtr + offset, size, cuStream)); - - if (event) { - retErr = retImplEv->record(); - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) 
{ - retErr = err; - } - return retErr; -} - -pi_result cuda_piEventsWait(pi_uint32 num_events, const pi_event *event_list) { - - try { - assert(num_events != 0); - assert(event_list); - if (num_events == 0) { - return PI_ERROR_INVALID_VALUE; - } - - if (!event_list) { - return PI_ERROR_INVALID_EVENT; - } - - auto context = event_list[0]->get_context(); - ScopedContext active(context); - - auto waitFunc = [context](pi_event event) -> pi_result { - if (!event) { - return PI_ERROR_INVALID_EVENT; - } - - if (event->get_context() != context) { - return PI_ERROR_INVALID_CONTEXT; - } - - return event->wait(); - }; - return forLatestEvents(event_list, num_events, waitFunc); - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_OUT_OF_RESOURCES; - } -} - -pi_result cuda_piKernelCreate(pi_program program, const char *kernel_name, - pi_kernel *kernel) { - assert(kernel != nullptr); - assert(program != nullptr); - - pi_result retErr = PI_SUCCESS; - std::unique_ptr<_pi_kernel> retKernel{nullptr}; - - try { - ScopedContext active(program->get_context()); - - CUfunction cuFunc; - retErr = PI_CHECK_ERROR( - cuModuleGetFunction(&cuFunc, program->get(), kernel_name)); - - std::string kernel_name_woffset = std::string(kernel_name) + "_with_offset"; - CUfunction cuFuncWithOffsetParam; - CUresult offsetRes = cuModuleGetFunction( - &cuFuncWithOffsetParam, program->get(), kernel_name_woffset.c_str()); - - // If there is no kernel with global offset parameter we mark it as missing - if (offsetRes == CUDA_ERROR_NOT_FOUND) { - cuFuncWithOffsetParam = nullptr; - } else { - retErr = PI_CHECK_ERROR(offsetRes); - } - - retKernel = std::unique_ptr<_pi_kernel>( - new _pi_kernel{cuFunc, cuFuncWithOffsetParam, kernel_name, program, - program->get_context()}); - } catch (pi_result err) { - retErr = err; - } catch (...) { - retErr = PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *kernel = retKernel.release(); - return retErr; -} - -pi_result cuda_piKernelSetArg(pi_kernel kernel, pi_uint32 arg_index, - size_t arg_size, const void *arg_value) { - - assert(kernel != nullptr); - pi_result retErr = PI_SUCCESS; - try { - if (arg_value) { - kernel->set_kernel_arg(arg_index, arg_size, arg_value); - } else { - kernel->set_kernel_local_arg(arg_index, arg_size); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piextKernelSetArgMemObj(pi_kernel kernel, pi_uint32 arg_index, - const pi_mem *arg_value) { - - assert(kernel != nullptr); - assert(arg_value != nullptr); - - // Below sets kernel arg when zero-sized buffers are handled. - // In such case the corresponding memory is null. 
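// [Editor's illustration, not part of the patch.] cuda_piKernelCreate above
// probes for an optional "<name>_with_offset" variant and treats
// CUDA_ERROR_NOT_FOUND as "no such variant" rather than as a failure. The
// lookup pattern in isolation (helper name is ours):
#include <cuda.h>
#include <string>

inline CUresult getOptionalFunction(CUmodule Mod, const std::string &Name,
                                    CUfunction &Func) {
  CUresult Err = cuModuleGetFunction(&Func, Mod, Name.c_str());
  if (Err == CUDA_ERROR_NOT_FOUND) { // the variant simply does not exist
    Func = nullptr;
    return CUDA_SUCCESS;
  }
  return Err; // success, or a real error worth propagating
}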
- if (*arg_value == nullptr) { - kernel->set_kernel_arg(arg_index, 0, nullptr); - return PI_SUCCESS; - } - - pi_result retErr = PI_SUCCESS; - try { - pi_mem arg_mem = *arg_value; - if (arg_mem->mem_type_ == _pi_mem::mem_type::surface) { - CUDA_ARRAY3D_DESCRIPTOR arrayDesc; - PI_CHECK_ERROR(cuArray3DGetDescriptor( - &arrayDesc, arg_mem->mem_.surface_mem_.get_array())); - if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && - arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && - arrayDesc.Format != CU_AD_FORMAT_HALF && - arrayDesc.Format != CU_AD_FORMAT_FLOAT) { - setErrorMessage("PI CUDA kernels only support images with channel " - "types int32, uint32, float, and half.", - PI_ERROR_PLUGIN_SPECIFIC_ERROR); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - CUsurfObject cuSurf = arg_mem->mem_.surface_mem_.get_surface(); - kernel->set_kernel_arg(arg_index, sizeof(cuSurf), (void *)&cuSurf); - } else { - CUdeviceptr cuPtr = arg_mem->mem_.buffer_mem_.get(); - kernel->set_kernel_arg(arg_index, sizeof(CUdeviceptr), (void *)&cuPtr); - } - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piextKernelSetArgSampler(pi_kernel kernel, pi_uint32 arg_index, - const pi_sampler *arg_value) { - - assert(kernel != nullptr); - assert(arg_value != nullptr); - - pi_result retErr = PI_SUCCESS; - try { - pi_uint32 samplerProps = (*arg_value)->props_; - kernel->set_kernel_arg(arg_index, sizeof(pi_uint32), (void *)&samplerProps); - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, - pi_kernel_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - // Here we want to query about a kernel's cuda blocks! - - if (kernel != nullptr) { - - switch (param_name) { - case PI_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { - size_t global_work_size[3] = {0, 0, 0}; - - int max_block_dimX{0}, max_block_dimY{0}, max_block_dimZ{0}; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimY, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_block_dimZ, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - device->get()) == CUDA_SUCCESS); - - int max_grid_dimX{0}, max_grid_dimY{0}, max_grid_dimZ{0}; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimX, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimY, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - device->get()) == CUDA_SUCCESS); - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&max_grid_dimZ, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - device->get()) == CUDA_SUCCESS); - - global_work_size[0] = max_block_dimX * max_grid_dimX; - global_work_size[1] = max_block_dimY * max_grid_dimY; - global_work_size[2] = max_block_dimZ * max_grid_dimZ; - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, global_work_size); - } - case PI_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { - int max_threads = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - size_t(max_threads)); - } - case PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - 
size_t group_size[3] = {0, 0, 0}; - const auto &reqd_wg_size_md_map = - kernel->program_->kernelReqdWorkGroupSizeMD_; - const auto reqd_wg_size_md = reqd_wg_size_md_map.find(kernel->name_); - if (reqd_wg_size_md != reqd_wg_size_md_map.end()) { - const auto reqd_wg_size = reqd_wg_size_md->second; - group_size[0] = std::get<0>(reqd_wg_size); - group_size[1] = std::get<1>(reqd_wg_size); - group_size[2] = std::get<2>(reqd_wg_size); - } - return getInfoArray(3, param_value_size, param_value, - param_value_size_ret, group_size); - } - case PI_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { - // OpenCL LOCAL == CUDA SHARED - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { - // Work groups should be multiples of the warp size - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { - // OpenCL PRIVATE == CUDA LOCAL - int bytes = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint64(bytes)); - } - case PI_KERNEL_GROUP_INFO_NUM_REGS: { - int numRegs = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, - kernel->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - pi_uint32(numRegs)); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - - return PI_ERROR_INVALID_KERNEL; -} - -pi_result cuda_piEnqueueKernelLaunch( - pi_queue command_queue, pi_kernel kernel, pi_uint32 work_dim, - const size_t *global_work_offset, const size_t *global_work_size, - const size_t *local_work_size, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - // Preconditions - assert(command_queue != nullptr); - assert(command_queue->get_context() == kernel->get_context()); - assert(kernel != nullptr); - assert(global_work_offset != nullptr); - assert(work_dim > 0); - assert(work_dim < 4); - - if (*global_work_size == 0) { - return cuda_piEnqueueEventsWaitWithBarrier( - command_queue, num_events_in_wait_list, event_wait_list, event); - } - - // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number - size_t threadsPerBlock[3] = {32u, 1u, 1u}; - size_t maxWorkGroupSize = 0u; - size_t maxThreadsPerBlock[3] = {}; - bool providedLocalWorkGroupSize = (local_work_size != nullptr); - pi_uint32 local_size = kernel->get_local_size(); - pi_result retError = PI_SUCCESS; - - try { - // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext active(command_queue->get_context()); - { - size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; - maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); - command_queue->device_->get_max_work_item_sizes( - sizeof(maxThreadsPerBlock), maxThreadsPerBlock); - - if (providedLocalWorkGroupSize) { - auto isValid = [&](int dim) { - if (reqdThreadsPerBlock[dim] != 0 && - local_work_size[dim] != 
reqdThreadsPerBlock[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - - if (local_work_size[dim] > maxThreadsPerBlock[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. - if (0u == local_work_size[dim]) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (global_work_size[dim] % local_work_size[dim])) - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - threadsPerBlock[dim] = local_work_size[dim]; - return PI_SUCCESS; - }; - - size_t kernelLocalWorkGroupSize = 0; - for (size_t dim = 0; dim < work_dim; dim++) { - auto err = isValid(dim); - if (err != PI_SUCCESS) - return err; - // If no error then sum the total local work size per dim. - kernelLocalWorkGroupSize += local_work_size[dim]; - } - - if (hasExceededMaxRegistersPerBlock(command_queue->device_, kernel, - kernelLocalWorkGroupSize)) { - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - } else { - guessLocalWorkSize(command_queue->device_, threadsPerBlock, - global_work_size, maxThreadsPerBlock, kernel, - local_size); - } - } - - if (maxWorkGroupSize < - size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { - return PI_ERROR_INVALID_WORK_GROUP_SIZE; - } - - size_t blocksPerGrid[3] = {1u, 1u, 1u}; - - for (size_t i = 0; i < work_dim; i++) { - blocksPerGrid[i] = - (global_work_size[i] + threadsPerBlock[i] - 1) / threadsPerBlock[i]; - } - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - pi_uint32 stream_token; - _pi_stream_guard guard; - CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, event_wait_list, guard, &stream_token); - CUfunction cuFunc = kernel->get(); - - retError = enqueueEventsWait(command_queue, cuStream, - num_events_in_wait_list, event_wait_list); - - // Set the implicit global offset parameter if kernel has offset variant - if (kernel->get_with_offset_parameter()) { - std::uint32_t cuda_implicit_offset[3] = {0, 0, 0}; - if (global_work_offset) { - for (size_t i = 0; i < work_dim; i++) { - cuda_implicit_offset[i] = - static_cast(global_work_offset[i]); - if (global_work_offset[i] != 0) { - cuFunc = kernel->get_with_offset_parameter(); - } - } - } - kernel->set_implicit_offset_arg(sizeof(cuda_implicit_offset), - cuda_implicit_offset); - } - - auto &argIndices = kernel->get_arg_indices(); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue, - cuStream, stream_token)); - retImplEv->start(); - } - - // Set local mem max size if env var is present - static const char *local_mem_sz_ptr = - std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); - - if (local_mem_sz_ptr) { - int device_max_local_mem = 0; - cuDeviceGetAttribute( - &device_max_local_mem, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, - command_queue->get_device()->get()); - - static const int env_val = std::atoi(local_mem_sz_ptr); - if (env_val <= 0 || env_val > device_max_local_mem) { - setErrorMessage("Invalid value specified for " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", - PI_ERROR_PLUGIN_SPECIFIC_ERROR); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - PI_CHECK_ERROR(cuFuncSetAttribute( - cuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, env_val)); - } - - retError = PI_CHECK_ERROR(cuLaunchKernel( - cuFunc, blocksPerGrid[0], blocksPerGrid[1], blocksPerGrid[2], - threadsPerBlock[0], threadsPerBlock[1], threadsPerBlock[2], local_size, - cuStream, 
const_cast(argIndices.data()), nullptr)); - if (local_size != 0) - kernel->clear_local_size(); - - if (event) { - retError = retImplEv->record(); - *event = retImplEv.release(); - } - } catch (pi_result err) { - retError = err; - } - return retError; -} - -/// \TODO Not implemented -pi_result cuda_piEnqueueNativeKernel(pi_queue, void (*)(void *), void *, size_t, - pi_uint32, const pi_mem *, const void **, - pi_uint32, const pi_event *, pi_event *) { - sycl::detail::pi::die("Not implemented in CUDA backend"); - return {}; -} - -pi_result cuda_piextKernelCreateWithNativeHandle(pi_native_handle, pi_context, - pi_program, bool, - pi_kernel *) { - sycl::detail::pi::die("Unsupported operation"); - return PI_SUCCESS; -} - -/// \TODO Not implemented -pi_result cuda_piMemImageCreate(pi_context context, pi_mem_flags flags, - const pi_image_format *image_format, - const pi_image_desc *image_desc, void *host_ptr, - pi_mem *ret_mem) { - // Need input memory object - assert(ret_mem != nullptr); - const bool performInitialCopy = (flags & PI_MEM_FLAGS_HOST_PTR_COPY) || - ((flags & PI_MEM_FLAGS_HOST_PTR_USE)); - pi_result retErr = PI_SUCCESS; - - // We only support RBGA channel order - // TODO: check SYCL CTS and spec. May also have to support BGRA - if (image_format->image_channel_order != - pi_image_channel_order::PI_IMAGE_CHANNEL_ORDER_RGBA) { - sycl::detail::pi::die( - "cuda_piMemImageCreate only supports RGBA channel order"); - } - - // We have to use cuArray3DCreate, which has some caveats. The height and - // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc gives - // a minimum value of 1, so we need to convert the answer. - CUDA_ARRAY3D_DESCRIPTOR array_desc; - array_desc.NumChannels = 4; // Only support 4 channel image - array_desc.Flags = 0; // No flags required - array_desc.Width = image_desc->image_width; - if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { - array_desc.Height = 0; - array_desc.Depth = 0; - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { - array_desc.Height = image_desc->image_height; - array_desc.Depth = 0; - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { - array_desc.Height = image_desc->image_height; - array_desc.Depth = image_desc->image_depth; - } - - // We need to get this now in bytes for calculating the total image size later - size_t pixel_type_size_bytes; - - switch (image_format->image_channel_data_type) { - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT8: - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - pixel_type_size_bytes = 1; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT8; - pixel_type_size_bytes = 1; - break; - case PI_IMAGE_CHANNEL_TYPE_UNORM_INT16: - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT16; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - array_desc.Format = CU_AD_FORMAT_HALF; - pixel_type_size_bytes = 2; - break; - case PI_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_UNSIGNED_INT32; - pixel_type_size_bytes = 4; - break; - case PI_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - array_desc.Format = CU_AD_FORMAT_SIGNED_INT32; - pixel_type_size_bytes = 4; - break; - case PI_IMAGE_CHANNEL_TYPE_FLOAT: - array_desc.Format = CU_AD_FORMAT_FLOAT; - pixel_type_size_bytes = 4; - break; - default: - 
sycl::detail::pi::die( - "cuda_piMemImageCreate given unsupported image_channel_data_type"); - } - - // When a dimension isn't used image_desc has the size set to 1 - size_t pixel_size_bytes = - pixel_type_size_bytes * 4; // 4 is the only number of channels we support - size_t image_size_bytes = pixel_size_bytes * image_desc->image_width * - image_desc->image_height * image_desc->image_depth; - - ScopedContext active(context); - CUarray image_array; - retErr = PI_CHECK_ERROR(cuArray3DCreate(&image_array, &array_desc)); - - try { - if (performInitialCopy) { - // We have to use a different copy function for each image dimensionality - if (image_desc->image_type == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - cuMemcpyHtoA(image_array, 0, host_ptr, image_size_bytes)); - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = host_ptr; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; - cpy_desc.Height = image_desc->image_height; - retErr = PI_CHECK_ERROR(cuMemcpy2D(&cpy_desc)); - } else if (image_desc->image_type == PI_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpy_desc; - memset(&cpy_desc, 0, sizeof(cpy_desc)); - cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - cpy_desc.srcHost = host_ptr; - cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - cpy_desc.dstArray = image_array; - cpy_desc.WidthInBytes = pixel_size_bytes * image_desc->image_width; - cpy_desc.Height = image_desc->image_height; - cpy_desc.Depth = image_desc->image_depth; - retErr = PI_CHECK_ERROR(cuMemcpy3D(&cpy_desc)); - } - } - - // CUDA_RESOURCE_DESC is a union of different structs, shown here - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html - // We need to fill it as described here to use it for a surface or texture - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html - // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and - // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array - // handle. - // CUDA_RESOURCE_DESC::flags must be set to zero - - CUDA_RESOURCE_DESC image_res_desc; - image_res_desc.res.array.hArray = image_array; - image_res_desc.resType = CU_RESOURCE_TYPE_ARRAY; - image_res_desc.flags = 0; - - CUsurfObject surface; - retErr = PI_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); - - auto piMemObj = std::unique_ptr<_pi_mem>(new _pi_mem{ - context, image_array, surface, image_desc->image_type, host_ptr}); - - if (piMemObj == nullptr) { - return PI_ERROR_OUT_OF_HOST_MEMORY; - } - - *ret_mem = piMemObj.release(); - } catch (pi_result err) { - cuArrayDestroy(image_array); - return err; - } catch (...) { - cuArrayDestroy(image_array); - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -/// \TODO Not implemented -pi_result cuda_piMemImageGetInfo(pi_mem, pi_image_info, size_t, void *, - size_t *) { - sycl::detail::pi::die("cuda_piMemImageGetInfo not implemented"); - return {}; -} - -pi_result cuda_piMemRetain(pi_mem mem) { - assert(mem != nullptr); - assert(mem->get_reference_count() > 0); - mem->increment_reference_count(); - return PI_SUCCESS; -} - -/// Not used as CUDA backend only creates programs from binary. -/// See \ref cuda_piclProgramCreateWithBinary. 
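// [Editor's note, not part of the patch.] The image size computed in
// cuda_piMemImageCreate above is channel size * 4 channels * width * height *
// depth, with unused dimensions reported as 1 by the image descriptor. For
// example, a 2D 1024x768 RGBA image with PI_IMAGE_CHANNEL_TYPE_FLOAT data
// occupies 4 bytes * 4 * 1024 * 768 * 1 = 12,582,912 bytes (12 MiB).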
-/// -pi_result cuda_piclProgramCreateWithSource(pi_context, pi_uint32, const char **, - const size_t *, pi_program *) { - sycl::detail::pi::cuPrint("cuda_piclProgramCreateWithSource not implemented"); - return PI_ERROR_INVALID_OPERATION; -} - -/// Loads the images from a PI program into a CUmodule that can be -/// used later on to extract functions (kernels). -/// See \ref _pi_program for implementation details. -/// -pi_result cuda_piProgramBuild( - pi_program program, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data) { - - assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(program->get_context()); - - program->build_program(options); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -/// \TODO Not implemented -pi_result cuda_piProgramCreate(pi_context, const void *, size_t, pi_program *) { - sycl::detail::pi::die("cuda_piProgramCreate not implemented"); - return {}; -} - -/// Loads images from a list of PTX or CUBIN binaries. -/// Note: No calls to CUDA driver API in this function, only store binaries -/// for later. -/// -/// Note: Only supports one device -/// -pi_result cuda_piProgramCreateWithBinary( - pi_context context, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const size_t *lengths, - const unsigned char **binaries, size_t num_metadata_entries, - const pi_device_binary_property *metadata, pi_int32 *binary_status, - pi_program *program) { - // Ignore unused parameter - (void)binary_status; - - assert(context != nullptr); - assert(binaries != nullptr); - assert(program != nullptr); - assert(device_list != nullptr); - assert(num_devices == 1 && "CUDA contexts are for a single device"); - assert((context->get_device()->get() == device_list[0]->get()) && - "Mismatch between devices context and passed context when creating " - "program from binary"); - - pi_result retError = PI_SUCCESS; - - std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; - - retProgram->set_metadata(metadata, num_metadata_entries); - - const bool has_length = (lengths != nullptr); - size_t length = has_length - ? 
lengths[0] - : strlen(reinterpret_cast(binaries[0])) + 1; - - assert(length != 0); - - retProgram->set_binary(reinterpret_cast(binaries[0]), length); - - *program = retProgram.release(); - - return retError; -} - -pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(program != nullptr); - - switch (param_name) { - case PI_PROGRAM_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->get_reference_count()); - case PI_PROGRAM_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->context_); - case PI_PROGRAM_INFO_NUM_DEVICES: - return getInfo(param_value_size, param_value, param_value_size_ret, 1u); - case PI_PROGRAM_INFO_DEVICES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->context_->deviceId_); - case PI_PROGRAM_INFO_SOURCE: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->binary_); - case PI_PROGRAM_INFO_BINARY_SIZES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->binarySizeInBytes_); - case PI_PROGRAM_INFO_BINARIES: - return getInfoArray(1, param_value_size, param_value, param_value_size_ret, - &program->binary_); - case PI_PROGRAM_INFO_KERNEL_NAMES: { - return getInfo(param_value_size, param_value, param_value_size_ret, - getKernelNames(program).c_str()); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Program info request not implemented"); - return {}; -} - -/// Creates a new PI program object that is the outcome of linking all input -/// programs. -/// \TODO Implement linker options, requires mapping of OpenCL to CUDA -/// -pi_result cuda_piProgramLink( - pi_context context, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - pi_uint32 num_input_programs, const pi_program *input_programs, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data, pi_program *ret_program) { - - assert(ret_program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(context); - - CUlinkState state; - std::unique_ptr<_pi_program> retProgram{new _pi_program{context}}; - - retError = PI_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &state)); - try { - for (size_t i = 0; i < num_input_programs; ++i) { - pi_program program = input_programs[i]; - retError = PI_CHECK_ERROR(cuLinkAddData( - state, CU_JIT_INPUT_PTX, const_cast(program->binary_), - program->binarySizeInBytes_, nullptr, 0, nullptr, nullptr)); - } - void *cubin = nullptr; - size_t cubinSize = 0; - retError = PI_CHECK_ERROR(cuLinkComplete(state, &cubin, &cubinSize)); - - retError = - retProgram->set_binary(static_cast(cubin), cubinSize); - - if (retError != PI_SUCCESS) { - return retError; - } - - retError = retProgram->build_program(options); - - if (retError != PI_SUCCESS) { - return retError; - } - } catch (...) 
{ - // Upon error attempt cleanup - PI_CHECK_ERROR(cuLinkDestroy(state)); - throw; - } - - retError = PI_CHECK_ERROR(cuLinkDestroy(state)); - *ret_program = retProgram.release(); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -/// Creates a new program that is the outcome of the compilation of the headers -/// and the program. -/// \TODO Implement asynchronous compilation -/// -pi_result cuda_piProgramCompile( - pi_program program, [[maybe_unused]] pi_uint32 num_devices, - [[maybe_unused]] const pi_device *device_list, const char *options, - [[maybe_unused]] pi_uint32 num_input_headers, - const pi_program *input_headers, const char **header_include_names, - [[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data), - [[maybe_unused]] void *user_data) { - // Ignore unused parameters - (void)header_include_names; - (void)input_headers; - - assert(program != nullptr); - assert(num_devices == 1 || num_devices == 0); - assert(device_list != nullptr || num_devices == 0); - assert(pfn_notify == nullptr); - assert(user_data == nullptr); - assert(num_input_headers == 0); - pi_result retError = PI_SUCCESS; - - try { - ScopedContext active(program->get_context()); - - program->build_program(options); - - } catch (pi_result err) { - retError = err; - } - return retError; -} - -pi_result cuda_piProgramGetBuildInfo(pi_program program, pi_device device, - pi_program_build_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - // Ignore unused parameter - (void)device; - - assert(program != nullptr); - - switch (param_name) { - case PI_PROGRAM_BUILD_INFO_STATUS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - program->buildStatus_); - } - case PI_PROGRAM_BUILD_INFO_OPTIONS: - return getInfo(param_value_size, param_value, param_value_size_ret, - program->buildOptions_.c_str()); - case PI_PROGRAM_BUILD_INFO_LOG: - return getInfoArray(program->MAX_LOG_SIZE, param_value_size, param_value, - param_value_size_ret, program->infoLog_); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Program Build info request not implemented"); - return {}; -} - -pi_result cuda_piProgramRetain(pi_program program) { - assert(program != nullptr); - assert(program->get_reference_count() > 0); - program->increment_reference_count(); - return PI_SUCCESS; -} - -/// Decreases the reference count of a pi_program object. -/// When the reference count reaches 0, it unloads the module from -/// the context. -pi_result cuda_piProgramRelease(pi_program program) { - assert(program != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - assert(program->get_reference_count() != 0 && - "Reference count overflow detected in cuda_piProgramRelease."); - - // decrement ref count. If it is 0, delete the program. - if (program->decrement_reference_count() == 0) { - - std::unique_ptr<_pi_program> program_ptr{program}; - - pi_result result = PI_ERROR_INVALID_PROGRAM; - - try { - ScopedContext active(program->get_context()); - auto cuModule = program->get(); - result = PI_CHECK_ERROR(cuModuleUnload(cuModule)); - } catch (...) { - result = PI_ERROR_OUT_OF_RESOURCES; - } - - return result; - } - - return PI_SUCCESS; -} - -/// Gets the native CUDA handle of a PI program object -/// -/// \param[in] program The PI program to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI program object. 
-/// -/// \return TBD -pi_result cuda_piextProgramGetNativeHandle(pi_program program, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(program->get()); - return PI_SUCCESS; -} - -/// Created a PI program object from a CUDA program handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI program object from. -/// \param[in] context The PI context of the program. -/// \param[out] program Set to the PI program object created from native handle. -/// -/// \return TBD -pi_result cuda_piextProgramCreateWithNativeHandle(pi_native_handle, pi_context, - bool, pi_program *) { - sycl::detail::pi::die( - "Creation of PI program from native handle not implemented"); - return {}; -} - -pi_result cuda_piKernelGetInfo(pi_kernel kernel, pi_kernel_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - - if (kernel != nullptr) { - - switch (param_name) { - case PI_KERNEL_INFO_FUNCTION_NAME: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_name()); - case PI_KERNEL_INFO_NUM_ARGS: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_num_args()); - case PI_KERNEL_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_reference_count()); - case PI_KERNEL_INFO_CONTEXT: { - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_context()); - } - case PI_KERNEL_INFO_PROGRAM: { - return getInfo(param_value_size, param_value, param_value_size_ret, - kernel->get_program()); - } - case PI_KERNEL_INFO_ATTRIBUTES: { - return getInfo(param_value_size, param_value, param_value_size_ret, ""); - } - default: { - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - } - - return PI_ERROR_INVALID_KERNEL; -} - -pi_result cuda_piKernelGetSubGroupInfo( - pi_kernel kernel, pi_device device, pi_kernel_sub_group_info param_name, - size_t input_value_size, const void *input_value, size_t param_value_size, - void *param_value, size_t *param_value_size_ret) { - // Ignore unused parameters - (void)input_value_size; - (void)input_value; - - if (kernel != nullptr) { - switch (param_name) { - case PI_KERNEL_MAX_SUB_GROUP_SIZE: { - // Sub-group size is equivalent to warp size - int warpSize = 0; - sycl::detail::pi::assertion( - cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, - device->get()) == CUDA_SUCCESS); - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(warpSize)); - } - case PI_KERNEL_MAX_NUM_SUB_GROUPS: { - // Number of sub-groups = max block size / warp size + possible remainder - int max_threads = 0; - sycl::detail::pi::assertion( - cuFuncGetAttribute(&max_threads, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - kernel->get()) == CUDA_SUCCESS); - int warpSize = 0; - cuda_piKernelGetSubGroupInfo(kernel, device, PI_KERNEL_MAX_SUB_GROUP_SIZE, - 0, nullptr, sizeof(uint32_t), &warpSize, - nullptr); - int maxWarps = (max_threads + warpSize - 1) / warpSize; - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(maxWarps)); - } - case PI_KERNEL_COMPILE_NUM_SUB_GROUPS: { - // Return value of 0 => not specified - // TODO: Revisit if PTX is generated for compile-time work-group sizes - return getInfo(param_value_size, param_value, param_value_size_ret, 0); - } - case PI_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: { - // Return value of 0 => unspecified 
or "auto" sub-group size - // Correct for now, since warp size may be read from special register - // TODO: Return warp size once default is primary sub-group size - // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX - return getInfo(param_value_size, param_value, param_value_size_ret, 0); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - } - return PI_ERROR_INVALID_KERNEL; -} - -pi_result cuda_piKernelRetain(pi_kernel kernel) { - assert(kernel != nullptr); - assert(kernel->get_reference_count() > 0u); - - kernel->increment_reference_count(); - return PI_SUCCESS; -} - -pi_result cuda_piKernelRelease(pi_kernel kernel) { - assert(kernel != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - assert(kernel->get_reference_count() != 0 && - "Reference count overflow detected in cuda_piKernelRelease."); - - // decrement ref count. If it is 0, delete the program. - if (kernel->decrement_reference_count() == 0) { - // no internal cuda resources to clean up. Just delete it. - delete kernel; - return PI_SUCCESS; - } - - return PI_SUCCESS; -} - -// A NOP for the CUDA backend -pi_result cuda_piKernelSetExecInfo(pi_kernel, pi_kernel_exec_info, size_t, - const void *) { - return PI_SUCCESS; -} - -pi_result cuda_piextProgramSetSpecializationConstant(pi_program, pi_uint32, - size_t, const void *) { - // This entry point is only used for native specialization constants (SPIR-V), - // and the CUDA plugin is AOT only so this entry point is not supported. - sycl::detail::pi::die("Native specialization constants are not supported"); - return {}; -} - -pi_result cuda_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index, - size_t arg_size, - const void *arg_value) { - kernel->set_kernel_arg(arg_index, arg_size, arg_value); - return PI_SUCCESS; -} - -// -// Events -// -pi_result cuda_piEventCreate(pi_context, pi_event *) { - sycl::detail::pi::die("PI Event Create not implemented in CUDA backend"); -} - -pi_result cuda_piEventGetInfo(pi_event event, pi_event_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(event != nullptr); - - switch (param_name) { - case PI_EVENT_INFO_COMMAND_QUEUE: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_queue()); - case PI_EVENT_INFO_COMMAND_TYPE: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_command_type()); - case PI_EVENT_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_reference_count()); - case PI_EVENT_INFO_COMMAND_EXECUTION_STATUS: { - return getInfo(param_value_size, param_value, param_value_size_ret, - static_cast(event->get_execution_status())); - } - case PI_EVENT_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - event->get_context()); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - - return PI_ERROR_INVALID_EVENT; -} - -/// Obtain profiling information from PI CUDA events -/// \TODO Timings from CUDA are only elapsed time. 
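// Editor's illustrative sketch, not part of the original patch: one way a caller
// might use the profiling entry point defined just below to time a finished
// command. It assumes the event comes from a queue created with
// PI_QUEUE_FLAG_PROFILING_ENABLE and that the profiling counters are reported as
// pi_uint64 nanosecond values; the helper name is hypothetical.
static pi_uint64 exampleElapsedNanoseconds(pi_event ev) {
  pi_uint64 start = 0, end = 0;
  cuda_piEventGetProfilingInfo(ev, PI_PROFILING_INFO_COMMAND_START,
                               sizeof(start), &start, nullptr);
  cuda_piEventGetProfilingInfo(ev, PI_PROFILING_INFO_COMMAND_END, sizeof(end),
                               &end, nullptr);
  // CUDA only reports elapsed time, so only the difference is meaningful.
  return end - start;
}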
-pi_result cuda_piEventGetProfilingInfo(pi_event event, - pi_profiling_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - - assert(event != nullptr); - - pi_queue queue = event->get_queue(); - if (queue == nullptr || - !(queue->properties_ & PI_QUEUE_FLAG_PROFILING_ENABLE)) { - return PI_ERROR_PROFILING_INFO_NOT_AVAILABLE; - } - - switch (param_name) { - case PI_PROFILING_INFO_COMMAND_QUEUED: - case PI_PROFILING_INFO_COMMAND_SUBMIT: - // Note: No user for this case - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_queued_time()); - case PI_PROFILING_INFO_COMMAND_START: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_start_time()); - case PI_PROFILING_INFO_COMMAND_END: - return getInfo(param_value_size, param_value, - param_value_size_ret, event->get_end_time()); - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - sycl::detail::pi::die("Event Profiling info request not implemented"); - return {}; -} - -pi_result cuda_piEventSetCallback(pi_event, pi_int32, pfn_notify, void *) { - sycl::detail::pi::die("Event Callback not implemented in CUDA backend"); - return PI_SUCCESS; -} - -pi_result cuda_piEventSetStatus(pi_event, pi_int32) { - sycl::detail::pi::die("Event Set Status not implemented in CUDA backend"); - return PI_ERROR_INVALID_VALUE; -} - -pi_result cuda_piEventRetain(pi_event event) { - assert(event != nullptr); - - const auto refCount = event->increment_reference_count(); - - sycl::detail::pi::assertion( - refCount != 0, - "Reference count overflow detected in cuda_piEventRetain."); - - return PI_SUCCESS; -} - -pi_result cuda_piEventRelease(pi_event event) { - assert(event != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - sycl::detail::pi::assertion( - event->get_reference_count() != 0, - "Reference count overflow detected in cuda_piEventRelease."); - - // decrement ref count. If it is 0, delete the event. - if (event->decrement_reference_count() == 0) { - std::unique_ptr<_pi_event> event_ptr{event}; - pi_result result = PI_ERROR_INVALID_EVENT; - try { - ScopedContext active(event->get_context()); - result = event->release(); - } catch (...) { - result = PI_ERROR_OUT_OF_RESOURCES; - } - return result; - } - - return PI_SUCCESS; -} - -/// Enqueues a wait on the given CUstream for all events. -/// See \ref enqueueEventWait -/// TODO: Add support for multiple streams once the Event class is properly -/// refactored. -/// -pi_result cuda_piEnqueueEventsWait(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - return cuda_piEnqueueEventsWaitWithBarrier( - command_queue, num_events_in_wait_list, event_wait_list, event); -} - -/// Enqueues a wait on the given CUstream for all specified events (See -/// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued -/// wait will wait on all previous events in the queue. -/// -/// \param[in] command_queue A valid PI queue. -/// \param[in] num_events_in_wait_list Number of events in event_wait_list. -/// \param[in] event_wait_list Events to wait on. -/// \param[out] event Event for when all events in event_wait_list have finished -/// or, if event_wait_list is empty, when all previous events in the queue have -/// finished. 
-/// -/// \return TBD -pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - // This function makes one stream work on the previous work (or work - // represented by input events) and then all future work waits on that stream. - if (!command_queue) { - return PI_ERROR_INVALID_QUEUE; - } - - pi_result result; - - try { - ScopedContext active(command_queue->get_context()); - pi_uint32 stream_token; - _pi_stream_guard guard; - CUstream cuStream = command_queue->get_next_compute_stream( - num_events_in_wait_list, event_wait_list, guard, &stream_token); - { - std::lock_guard guard(command_queue->barrier_mutex_); - if (command_queue->barrier_event_ == nullptr) { - PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_event_, - CU_EVENT_DISABLE_TIMING)); - } - if (num_events_in_wait_list == 0) { // wait on all work - if (command_queue->barrier_tmp_event_ == nullptr) { - PI_CHECK_ERROR(cuEventCreate(&command_queue->barrier_tmp_event_, - CU_EVENT_DISABLE_TIMING)); - } - command_queue->sync_streams( - [cuStream, - tmp_event = command_queue->barrier_tmp_event_](CUstream s) { - if (cuStream != s) { - // record a new CUDA event on every stream and make one stream - // wait for these events - PI_CHECK_ERROR(cuEventRecord(tmp_event, s)); - PI_CHECK_ERROR(cuStreamWaitEvent(cuStream, tmp_event, 0)); - } - }); - } else { // wait just on given events - forLatestEvents(event_wait_list, num_events_in_wait_list, - [cuStream](pi_event event) -> pi_result { - if (event->get_queue()->has_been_synchronized( - event->get_compute_stream_token())) { - return PI_SUCCESS; - } else { - return PI_CHECK_ERROR( - cuStreamWaitEvent(cuStream, event->get(), 0)); - } - }); - } - - result = PI_CHECK_ERROR( - cuEventRecord(command_queue->barrier_event_, cuStream)); - for (unsigned int i = 0; - i < command_queue->compute_applied_barrier_.size(); i++) { - command_queue->compute_applied_barrier_[i] = false; - } - for (unsigned int i = 0; - i < command_queue->transfer_applied_barrier_.size(); i++) { - command_queue->transfer_applied_barrier_[i] = false; - } - } - if (result != PI_SUCCESS) { - return result; - } - - if (event) { - *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, - cuStream, stream_token); - (*event)->start(); - (*event)->record(); - } - - return PI_SUCCESS; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - -/// Gets the native CUDA handle of a PI event object -/// -/// \param[in] event The PI event to get the native CUDA object of. -/// \param[out] nativeHandle Set to the native handle of the PI event object. -/// -/// \return PI_SUCCESS on success. PI_ERROR_INVALID_EVENT if given a user event. -pi_result cuda_piextEventGetNativeHandle(pi_event event, - pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(event->get()); - return PI_SUCCESS; -} - -/// Created a PI event object from a CUDA event handle. -/// TODO: Implement this. -/// NOTE: The created PI object takes ownership of the native handle. -/// -/// \param[in] nativeHandle The native handle to create PI event object from. -/// \param[out] event Set to the PI event object created from native handle. 
-/// -/// \return TBD -pi_result cuda_piextEventCreateWithNativeHandle(pi_native_handle nativeHandle, - pi_context context, - bool ownNativeHandle, - pi_event *event) { - (void)ownNativeHandle; - assert(!ownNativeHandle); - - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - *event = _pi_event::make_with_native(context, - reinterpret_cast(nativeHandle)); - - return PI_SUCCESS; -} - -/// Creates a PI sampler object -/// -/// \param[in] context The context the sampler is created for. -/// \param[in] sampler_properties The properties for the sampler. -/// \param[out] result_sampler Set to the resulting sampler object. -/// -/// \return PI_SUCCESS on success. PI_ERROR_INVALID_VALUE if given an invalid -/// property -/// or if there is multiple of properties from the same category. -pi_result cuda_piSamplerCreate(pi_context context, - const pi_sampler_properties *sampler_properties, - pi_sampler *result_sampler) { - std::unique_ptr<_pi_sampler> retImplSampl{new _pi_sampler(context)}; - - bool propSeen[3] = {false, false, false}; - for (size_t i = 0; sampler_properties[i] != 0; i += 2) { - switch (sampler_properties[i]) { - case PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS: - if (propSeen[0]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[0] = true; - retImplSampl->props_ |= sampler_properties[i + 1]; - break; - case PI_SAMPLER_PROPERTIES_FILTER_MODE: - if (propSeen[1]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[1] = true; - retImplSampl->props_ |= - (sampler_properties[i + 1] - PI_SAMPLER_FILTER_MODE_NEAREST) << 1; - break; - case PI_SAMPLER_PROPERTIES_ADDRESSING_MODE: - if (propSeen[2]) { - return PI_ERROR_INVALID_VALUE; - } - propSeen[2] = true; - retImplSampl->props_ |= - (sampler_properties[i + 1] - PI_SAMPLER_ADDRESSING_MODE_NONE) << 2; - break; - default: - return PI_ERROR_INVALID_VALUE; - } - } - - if (!propSeen[0]) { - retImplSampl->props_ |= PI_TRUE; - } - // Default filter mode to PI_SAMPLER_FILTER_MODE_NEAREST - if (!propSeen[2]) { - retImplSampl->props_ |= - (PI_SAMPLER_ADDRESSING_MODE_CLAMP % PI_SAMPLER_ADDRESSING_MODE_NONE) - << 2; - } - - *result_sampler = retImplSampl.release(); - return PI_SUCCESS; -} - -/// Gets information from a PI sampler object -/// -/// \param[in] sampler The sampler to get the information from. -/// \param[in] param_name The name of the information to get. -/// \param[in] param_value_size The size of the param_value. -/// \param[out] param_value Set to information value. -/// \param[out] param_value_size_ret Set to the size of the information value. -/// -/// \return PI_SUCCESS on success. 
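// Editor's illustrative sketch, not part of the original patch: the
// zero-terminated key/value property list accepted by cuda_piSamplerCreate
// above. The packed props_ field then holds bit 0 = normalized coords,
// bit 1 = filter mode, bits 2+ = addressing mode, which is what
// cuda_piSamplerGetInfo below decodes. The context argument and helper name
// are hypothetical.
static pi_result exampleCreateSampler(pi_context ctx, pi_sampler *out) {
  const pi_sampler_properties props[] = {
      PI_SAMPLER_PROPERTIES_NORMALIZED_COORDS, PI_TRUE,
      PI_SAMPLER_PROPERTIES_FILTER_MODE,       PI_SAMPLER_FILTER_MODE_NEAREST,
      PI_SAMPLER_PROPERTIES_ADDRESSING_MODE,   PI_SAMPLER_ADDRESSING_MODE_CLAMP,
      0};
  return cuda_piSamplerCreate(ctx, props, out);
}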
-pi_result cuda_piSamplerGetInfo(pi_sampler sampler, pi_sampler_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret) { - assert(sampler != nullptr); - - switch (param_name) { - case PI_SAMPLER_INFO_REFERENCE_COUNT: - return getInfo(param_value_size, param_value, param_value_size_ret, - sampler->get_reference_count()); - case PI_SAMPLER_INFO_CONTEXT: - return getInfo(param_value_size, param_value, param_value_size_ret, - sampler->context_); - case PI_SAMPLER_INFO_NORMALIZED_COORDS: { - pi_bool norm_coords_prop = static_cast(sampler->props_ & 0x1); - return getInfo(param_value_size, param_value, param_value_size_ret, - norm_coords_prop); - } - case PI_SAMPLER_INFO_FILTER_MODE: { - pi_sampler_filter_mode filter_prop = static_cast( - ((sampler->props_ >> 1) & 0x1) + PI_SAMPLER_FILTER_MODE_NEAREST); - return getInfo(param_value_size, param_value, param_value_size_ret, - filter_prop); - } - case PI_SAMPLER_INFO_ADDRESSING_MODE: { - pi_sampler_addressing_mode addressing_prop = - static_cast( - (sampler->props_ >> 2) + PI_SAMPLER_ADDRESSING_MODE_NONE); - return getInfo(param_value_size, param_value, param_value_size_ret, - addressing_prop); - } - default: - __SYCL_PI_HANDLE_UNKNOWN_PARAM_NAME(param_name); - } - return {}; -} - -/// Retains a PI sampler object, incrementing its reference count. -/// -/// \param[in] sampler The sampler to increment the reference count of. -/// -/// \return PI_SUCCESS. -pi_result cuda_piSamplerRetain(pi_sampler sampler) { - assert(sampler != nullptr); - sampler->increment_reference_count(); - return PI_SUCCESS; -} - -/// Releases a PI sampler object, decrementing its reference count. If the -/// reference count reaches zero, the sampler object is destroyed. -/// -/// \param[in] sampler The sampler to decrement the reference count of. -/// -/// \return PI_SUCCESS. -pi_result cuda_piSamplerRelease(pi_sampler sampler) { - assert(sampler != nullptr); - - // double delete or someone is messing with the ref count. - // either way, cannot safely proceed. - sycl::detail::pi::assertion( - sampler->get_reference_count() != 0, - "Reference count overflow detected in cuda_piSamplerRelease."); - - // decrement ref count. If it is 0, delete the sampler. - if (sampler->decrement_reference_count() == 0) { - delete sampler; - } - - return PI_SUCCESS; -} - -/// General 3D memory copy operation. -/// This function requires the corresponding CUDA context to be at the top of -/// the context stack -/// If the source and/or destination is on the device, src_ptr and/or dst_ptr -/// must be a pointer to a CUdeviceptr -static pi_result commonEnqueueMemBufferCopyRect( - CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr, - const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset, - size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, - const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset, - size_t dst_row_pitch, size_t dst_slice_pitch) { - - assert(region != nullptr); - assert(src_offset != nullptr); - assert(dst_offset != nullptr); - - assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST); - assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST); - - src_row_pitch = (!src_row_pitch) ? region->width_bytes + src_offset->x_bytes - : src_row_pitch; - src_slice_pitch = - (!src_slice_pitch) - ? ((region->height_scalar + src_offset->y_scalar) * src_row_pitch) - : src_slice_pitch; - dst_row_pitch = (!dst_row_pitch) ? 
region->width_bytes + dst_offset->x_bytes - : dst_row_pitch; - dst_slice_pitch = - (!dst_slice_pitch) - ? ((region->height_scalar + dst_offset->y_scalar) * dst_row_pitch) - : dst_slice_pitch; - - CUDA_MEMCPY3D params = {}; - - params.WidthInBytes = region->width_bytes; - params.Height = region->height_scalar; - params.Depth = region->depth_scalar; - - params.srcMemoryType = src_type; - params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(src_ptr) - : 0; - params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; - params.srcXInBytes = src_offset->x_bytes; - params.srcY = src_offset->y_scalar; - params.srcZ = src_offset->z_scalar; - params.srcPitch = src_row_pitch; - params.srcHeight = src_slice_pitch / src_row_pitch; - - params.dstMemoryType = dst_type; - params.dstDevice = dst_type == CU_MEMORYTYPE_DEVICE - ? *static_cast(dst_ptr) - : 0; - params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr; - params.dstXInBytes = dst_offset->x_bytes; - params.dstY = dst_offset->y_scalar; - params.dstZ = dst_offset->z_scalar; - params.dstPitch = dst_row_pitch; - params.dstHeight = dst_slice_pitch / dst_row_pitch; - - return PI_CHECK_ERROR(cuMemcpy3DAsync(¶ms, cu_stream)); -} - -pi_result cuda_piEnqueueMemBufferReadRect( - pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, - pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, - size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset, - buffer_row_pitch, buffer_slice_pitch, ptr, CU_MEMORYTYPE_HOST, - host_offset, host_row_pitch, host_slice_pitch); - - if (event) { - retErr = retImplEv->record(); - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferWriteRect( - pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, - pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, - size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, - const void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - - assert(buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, 
num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, ptr, CU_MEMORYTYPE_HOST, host_offset, host_row_pitch, - host_slice_pitch, &devPtr, CU_MEMORYTYPE_DEVICE, buffer_offset, - buffer_row_pitch, buffer_slice_pitch); - - if (event) { - retErr = retImplEv->record(); - } - - if (blocking_write) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - - if (event) { - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, - pi_mem dst_buffer, size_t src_offset, - size_t dst_offset, size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - if (!command_queue) { - return PI_ERROR_INVALID_QUEUE; - } - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - pi_result result; - - auto stream = command_queue->get_next_transfer_stream(); - result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream)); - result = retImplEv->start(); - } - - auto src = src_buffer->mem_.buffer_mem_.get() + src_offset; - auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset; - - result = PI_CHECK_ERROR(cuMemcpyDtoDAsync(dst, src, size, stream)); - - if (event) { - result = retImplEv->record(); - *event = retImplEv.release(); - } - - return result; - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } -} - -pi_result cuda_piEnqueueMemBufferCopyRect( - pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, - pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, - pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, - size_t dst_row_pitch, size_t dst_slice_pitch, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - - assert(src_buffer != nullptr); - assert(dst_buffer != nullptr); - assert(command_queue != nullptr); - - pi_result retErr = PI_SUCCESS; - CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get(); - CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get(); - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, cuStream)); - retImplEv->start(); - } - - retErr = commonEnqueueMemBufferCopyRect( - cuStream, region, &srcPtr, CU_MEMORYTYPE_DEVICE, src_origin, - src_row_pitch, src_slice_pitch, &dstPtr, CU_MEMORYTYPE_DEVICE, - dst_origin, dst_row_pitch, dst_slice_pitch); - - if (event) { - retImplEv->record(); - *event = retImplEv.release(); - } - - } catch (pi_result err) { - retErr = err; - } - return retErr; -} - -pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, - const void *pattern, size_t pattern_size, - size_t offset, size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - assert(command_queue != nullptr); - - auto args_are_multiples_of_pattern_size = - (offset % pattern_size == 0) || (size % pattern_size == 0); - - auto pattern_is_valid = (pattern != nullptr); - - auto pattern_size_is_valid = - ((pattern_size & (pattern_size - 1)) == 0) && // is power of two - (pattern_size > 0) && (pattern_size <= 128); // falls within valid range - - assert(args_are_multiples_of_pattern_size && pattern_is_valid && - pattern_size_is_valid); - (void)args_are_multiples_of_pattern_size; - (void)pattern_is_valid; - (void)pattern_size_is_valid; - - std::unique_ptr<_pi_event> retImplEv{nullptr}; - - try { - ScopedContext active(command_queue->get_context()); - - auto stream = command_queue->get_next_transfer_stream(); - pi_result result; - result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, - event_wait_list); - - if (event) { - retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream)); - result = retImplEv->start(); - } - - auto dstDevice = buffer->mem_.buffer_mem_.get() + offset; - auto N = size / pattern_size; - - // pattern size in bytes - switch (pattern_size) { - case 1: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(cuMemsetD8Async(dstDevice, value, N, stream)); - break; - } - case 2: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(cuMemsetD16Async(dstDevice, value, N, stream)); - break; - } - case 4: { - auto value = *static_cast(pattern); - result = PI_CHECK_ERROR(cuMemsetD32Async(dstDevice, value, N, stream)); - break; - } - default: { - // CUDA has no memset functions that allow setting values more than 4 - // bytes. PI API lets you pass an arbitrary "pattern" to the buffer - // fill, which can be more than 4 bytes. 
We must break up the pattern - // into 4 byte values, and set the buffer using multiple strided calls. - // This means that one cuMemsetD2D32Async call is made for every 4 bytes - // in the pattern. - - auto number_of_steps = pattern_size / sizeof(uint32_t); - - // we walk up the pattern in 4-byte steps, and call cuMemset for each - // 4-byte chunk of the pattern. - for (auto step = 0u; step < number_of_steps; ++step) { - // take 4 bytes of the pattern - auto value = *(static_cast(pattern) + step); - - // offset the pointer to the part of the buffer we want to write to - auto offset_ptr = dstDevice + (step * sizeof(uint32_t)); - - // set all of the pattern chunks - result = PI_CHECK_ERROR( - cuMemsetD2D32Async(offset_ptr, pattern_size, value, 1, N, stream)); - } - - break; - } - } - - if (event) { - result = retImplEv->record(); - *event = retImplEv.release(); - } - - return result; - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } -} - -static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR array_desc) { - switch (array_desc.Format) { - case CU_AD_FORMAT_UNSIGNED_INT8: - case CU_AD_FORMAT_SIGNED_INT8: - return 1; - case CU_AD_FORMAT_UNSIGNED_INT16: - case CU_AD_FORMAT_SIGNED_INT16: - case CU_AD_FORMAT_HALF: - return 2; - case CU_AD_FORMAT_UNSIGNED_INT32: - case CU_AD_FORMAT_SIGNED_INT32: - case CU_AD_FORMAT_FLOAT: - return 4; - default: - sycl::detail::pi::die("Invalid image format."); - return 0; - } -} - -/// General ND memory copy operation for images (where N > 1). -/// This function requires the corresponding CUDA context to be at the top of -/// the context stack -/// If the source and/or destination is an array, src_ptr and/or dst_ptr -/// must be a pointer to a CUarray -static pi_result commonEnqueueMemImageNDCopy( - CUstream cu_stream, pi_mem_type img_type, const size_t *region, - const void *src_ptr, const CUmemorytype_enum src_type, - const size_t *src_offset, void *dst_ptr, const CUmemorytype_enum dst_type, - const size_t *dst_offset) { - assert(region != nullptr); - - assert(src_type == CU_MEMORYTYPE_ARRAY || src_type == CU_MEMORYTYPE_HOST); - assert(dst_type == CU_MEMORYTYPE_ARRAY || dst_type == CU_MEMORYTYPE_HOST); - - if (img_type == PI_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; - } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = dst_offset[1]; - } else { - cpyDesc.dstHost = dst_ptr; - } - cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - return PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cu_stream)); - } - if (img_type == PI_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D cpyDesc; - memset(&cpyDesc, 0, sizeof(cpyDesc)); - cpyDesc.srcMemoryType = src_type; - if (src_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.srcArray = *static_cast(src_ptr); - cpyDesc.srcXInBytes = src_offset[0]; - cpyDesc.srcY = src_offset[1]; - cpyDesc.srcZ = src_offset[2]; - } else { - cpyDesc.srcHost = src_ptr; - } - cpyDesc.dstMemoryType = dst_type; - if (dst_type == CU_MEMORYTYPE_ARRAY) { - cpyDesc.dstArray = *static_cast(dst_ptr); - cpyDesc.dstXInBytes = dst_offset[0]; - cpyDesc.dstY = dst_offset[1]; - cpyDesc.dstZ = dst_offset[2]; - } else { - cpyDesc.dstHost = 
dst_ptr; - } - cpyDesc.WidthInBytes = region[0]; - cpyDesc.Height = region[1]; - cpyDesc.Depth = region[2]; - return PI_CHECK_ERROR(cuMemcpy3DAsync(&cpyDesc, cu_stream)); - } - return PI_ERROR_INVALID_VALUE; -} - -pi_result cuda_piEnqueueMemImageRead( - pi_queue command_queue, pi_mem image, pi_bool blocking_read, - const size_t *origin, const size_t *region, size_t row_pitch, - size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - // Ignore unused parameters - (void)row_pitch; - (void)slice_pitch; - - assert(command_queue != nullptr); - assert(image != nullptr); - assert(image->mem_type_ == _pi_mem::mem_type::surface); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - CUarray array = image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); - - int elementByteSize = imageElementByteSize(arrayDesc); - - size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - cuMemcpyAtoHAsync(ptr, array, byteOffsetX, bytesToCopy, cuStream)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {byteOffsetX, origin[1], origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &array, CU_MEMORYTYPE_ARRAY, - srcOffset, ptr, CU_MEMORYTYPE_HOST, nullptr); - - if (retErr != PI_SUCCESS) { - return retErr; - } - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, - command_queue, cuStream); - new_event->record(); - *event = new_event; - } - - if (blocking_read) { - retErr = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -pi_result -cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, - pi_bool blocking_write, const size_t *origin, - const size_t *region, size_t input_row_pitch, - size_t input_slice_pitch, const void *ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - // Ignore unused parameters - (void)blocking_write; - (void)input_row_pitch; - (void)input_slice_pitch; - - assert(command_queue != nullptr); - assert(image != nullptr); - assert(image->mem_type_ == _pi_mem::mem_type::surface); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - CUarray array = image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR arrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&arrayDesc, array)); - - int elementByteSize = imageElementByteSize(arrayDesc); - - size_t byteOffsetX = origin[0] * elementByteSize * arrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * arrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = image->mem_.surface_mem_.get_image_type(); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR( - cuMemcpyHtoAAsync(array, byteOffsetX, ptr, bytesToCopy, cuStream)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t dstOffset[3] = {byteOffsetX, origin[1], origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, ptr, CU_MEMORYTYPE_HOST, nullptr, - &array, CU_MEMORYTYPE_ARRAY, dstOffset); - - if (retErr != PI_SUCCESS) { - return retErr; - } - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, - command_queue, cuStream); - new_event->record(); - *event = new_event; - } - } catch (pi_result err) { - return err; - } catch (...) 
{ - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, - pi_mem dst_image, const size_t *src_origin, - const size_t *dst_origin, - const size_t *region, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - assert(src_image->mem_type_ == _pi_mem::mem_type::surface); - assert(dst_image->mem_type_ == _pi_mem::mem_type::surface); - assert(src_image->mem_.surface_mem_.get_image_type() == - dst_image->mem_.surface_mem_.get_image_type()); - - pi_result retErr = PI_SUCCESS; - - try { - ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_next_transfer_stream(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, - event_wait_list); - - CUarray srcArray = src_image->mem_.surface_mem_.get_array(); - CUarray dstArray = dst_image->mem_.surface_mem_.get_array(); - - CUDA_ARRAY_DESCRIPTOR srcArrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&srcArrayDesc, srcArray)); - CUDA_ARRAY_DESCRIPTOR dstArrayDesc; - retErr = PI_CHECK_ERROR(cuArrayGetDescriptor(&dstArrayDesc, dstArray)); - - assert(srcArrayDesc.Format == dstArrayDesc.Format); - assert(srcArrayDesc.NumChannels == dstArrayDesc.NumChannels); - - int elementByteSize = imageElementByteSize(srcArrayDesc); - - size_t dstByteOffsetX = - dst_origin[0] * elementByteSize * srcArrayDesc.NumChannels; - size_t srcByteOffsetX = - src_origin[0] * elementByteSize * dstArrayDesc.NumChannels; - size_t bytesToCopy = elementByteSize * srcArrayDesc.NumChannels * region[0]; - - pi_mem_type imgType = src_image->mem_.surface_mem_.get_image_type(); - if (imgType == PI_MEM_TYPE_IMAGE1D) { - retErr = PI_CHECK_ERROR(cuMemcpyAtoA(dstArray, dstByteOffsetX, srcArray, - srcByteOffsetX, bytesToCopy)); - } else { - size_t adjustedRegion[3] = {bytesToCopy, region[1], region[2]}; - size_t srcOffset[3] = {srcByteOffsetX, src_origin[1], src_origin[2]}; - size_t dstOffset[3] = {dstByteOffsetX, dst_origin[1], dst_origin[2]}; - - retErr = commonEnqueueMemImageNDCopy( - cuStream, imgType, adjustedRegion, &srcArray, CU_MEMORYTYPE_ARRAY, - srcOffset, &dstArray, CU_MEMORYTYPE_ARRAY, dstOffset); - - if (retErr != PI_SUCCESS) { - return retErr; - } - } - - if (event) { - auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, - command_queue, cuStream); - new_event->record(); - *event = new_event; - } - } catch (pi_result err) { - return err; - } catch (...) { - return PI_ERROR_UNKNOWN; - } - - return retErr; -} - -/// \TODO Not implemented in CUDA. -pi_result cuda_piEnqueueMemImageFill(pi_queue, pi_mem, const void *, - const size_t *, const size_t *, pi_uint32, - const pi_event *, pi_event *) { - sycl::detail::pi::die("cuda_piEnqueueMemImageFill not implemented"); - return {}; -} - -/// Implements mapping on the host using a BufferRead operation. -/// Mapped pointers are stored in the pi_mem object. -/// If the buffer uses pinned host memory a pointer to that memory is returned -/// and no read operation is done. 
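// Editor's illustrative sketch, not part of the original patch: the usual
// map/modify/unmap round trip through the two entry points below. The queue,
// buffer and size arguments and the helper name are hypothetical; a blocking
// map guarantees the returned host pointer is valid on return.
static pi_result exampleMapWriteUnmap(pi_queue q, pi_mem buf, size_t bytes) {
  void *host_view = nullptr;
  pi_result res = cuda_piEnqueueMemBufferMap(q, buf, /*blocking_map=*/PI_TRUE,
                                             PI_MAP_READ | PI_MAP_WRITE,
                                             /*offset=*/0, bytes, 0, nullptr,
                                             nullptr, &host_view);
  if (res != PI_SUCCESS)
    return res;
  // ... read or modify host_view here ...
  return cuda_piEnqueueMemUnmap(q, buf, host_view, 0, nullptr, nullptr);
}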
-/// -pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, - pi_bool blocking_map, - pi_map_flags map_flags, size_t offset, - size_t size, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event, void **ret_map) { - assert(ret_map != nullptr); - assert(command_queue != nullptr); - assert(buffer != nullptr); - assert(buffer->mem_type_ == _pi_mem::mem_type::buffer); - - pi_result ret_err = PI_ERROR_INVALID_OPERATION; - const bool is_pinned = buffer->mem_.buffer_mem_.allocMode_ == - _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - - // Currently no support for overlapping regions - if (buffer->mem_.buffer_mem_.get_map_ptr() != nullptr) { - return ret_err; - } - - // Allocate a pointer in the host to store the mapped information - auto hostPtr = buffer->mem_.buffer_mem_.map_to_ptr(offset, map_flags); - *ret_map = buffer->mem_.buffer_mem_.get_map_ptr(); - if (hostPtr) { - ret_err = PI_SUCCESS; - } - - if (!is_pinned && ((map_flags & PI_MAP_READ) || (map_flags & PI_MAP_WRITE))) { - // Pinned host memory is already on host so it doesn't need to be read. - ret_err = cuda_piEnqueueMemBufferRead( - command_queue, buffer, blocking_map, offset, size, hostPtr, - num_events_in_wait_list, event_wait_list, event); - } else { - ScopedContext active(command_queue->get_context()); - - if (is_pinned) { - ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } - - if (event) { - try { - *event = _pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue, - command_queue->get_next_transfer_stream()); - (*event)->start(); - (*event)->record(); - } catch (pi_result error) { - ret_err = error; - } - } - } - - return ret_err; -} - -/// Implements the unmap from the host, using a BufferWrite operation. -/// Requires the mapped pointer to be already registered in the given memobj. -/// If memobj uses pinned host memory, this will not do a write. -/// -pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, - void *mapped_ptr, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - pi_event *event) { - pi_result ret_err = PI_SUCCESS; - - assert(command_queue != nullptr); - assert(mapped_ptr != nullptr); - assert(memobj != nullptr); - assert(memobj->mem_type_ == _pi_mem::mem_type::buffer); - assert(memobj->mem_.buffer_mem_.get_map_ptr() != nullptr); - assert(memobj->mem_.buffer_mem_.get_map_ptr() == mapped_ptr); - - const bool is_pinned = memobj->mem_.buffer_mem_.allocMode_ == - _pi_mem::mem_::buffer_mem_::alloc_mode::alloc_host_ptr; - - if (!is_pinned && - ((memobj->mem_.buffer_mem_.get_map_flags() & PI_MAP_WRITE) || - (memobj->mem_.buffer_mem_.get_map_flags() & - PI_MAP_WRITE_INVALIDATE_REGION))) { - // Pinned host memory is only on host so it doesn't need to be written to. 
- ret_err = cuda_piEnqueueMemBufferWrite( - command_queue, memobj, true, - memobj->mem_.buffer_mem_.get_map_offset(mapped_ptr), - memobj->mem_.buffer_mem_.get_size(), mapped_ptr, - num_events_in_wait_list, event_wait_list, event); - } else { - ScopedContext active(command_queue->get_context()); - - if (is_pinned) { - ret_err = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } - - if (event) { - try { - *event = _pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue, - command_queue->get_next_transfer_stream()); - (*event)->start(); - (*event)->record(); - } catch (pi_result error) { - ret_err = error; - } - } - } - - memobj->mem_.buffer_mem_.unmap(mapped_ptr); - return ret_err; -} - -/// USM: Implements USM Host allocations using CUDA Pinned Memory -/// -pi_result -cuda_piextUSMHostAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Implements USM device allocations using a normal CUDA device pointer -/// -pi_result -cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_device device, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(device != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)result_ptr, size)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Implements USM Shared allocations using CUDA Managed Memory -/// -pi_result -cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context, - [[maybe_unused]] pi_device device, - [[maybe_unused]] pi_usm_mem_properties *properties, - size_t size, [[maybe_unused]] pi_uint32 alignment) { - assert(result_ptr != nullptr); - assert(context != nullptr); - assert(device != nullptr); - assert(properties == nullptr || *properties == 0); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - result = PI_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)result_ptr, size, - CU_MEM_ATTACH_GLOBAL)); - } catch (pi_result error) { - result = error; - } - - assert(alignment == 0 || - (result == PI_SUCCESS && - reinterpret_cast(*result_ptr) % alignment == 0)); - return result; -} - -/// USM: Frees the given USM pointer associated with the context. 
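// Editor's illustrative sketch, not part of the original patch: pairing the USM
// allocation entry points above with cuda_piextUSMFree defined below. The
// context and device arguments and the helper name are hypothetical; the null
// properties and zero alignment match the defaults asserted by the allocators.
static pi_result exampleSharedAllocAndFree(pi_context ctx, pi_device dev) {
  void *ptr = nullptr;
  pi_result res = cuda_piextUSMSharedAlloc(&ptr, ctx, dev,
                                           /*properties=*/nullptr,
                                           /*size=*/1024, /*alignment=*/0);
  if (res != PI_SUCCESS)
    return res;
  // The managed allocation is accessible from both host and device here.
  return cuda_piextUSMFree(ctx, ptr);
}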
-/// -pi_result cuda_piextUSMFree(pi_context context, void *ptr) { - assert(context != nullptr); - pi_result result = PI_SUCCESS; - try { - ScopedContext active(context); - bool is_managed; - unsigned int type; - void *attribute_values[2] = {&is_managed, &type}; - CUpointer_attribute attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, - CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; - result = PI_CHECK_ERROR(cuPointerGetAttributes( - 2, attributes, attribute_values, (CUdeviceptr)ptr)); - assert(type == CU_MEMORYTYPE_DEVICE || type == CU_MEMORYTYPE_HOST); - if (is_managed || type == CU_MEMORYTYPE_DEVICE) { - // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed - // with cuMemFree - result = PI_CHECK_ERROR(cuMemFree((CUdeviceptr)ptr)); - } else { - // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost - result = PI_CHECK_ERROR(cuMemFreeHost(ptr)); - } - } catch (pi_result error) { - result = error; - } - return result; -} - -pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, - size_t count, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - assert(queue != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - pi_uint32 stream_token; - _pi_stream_guard guard; - CUstream cuStream = queue->get_next_compute_stream( - num_events_in_waitlist, events_waitlist, guard, &stream_token); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, cuStream, stream_token)); - event_ptr->start(); - } - result = PI_CHECK_ERROR(cuMemsetD8Async( - (CUdeviceptr)ptr, (unsigned char)value & 0xFF, count, cuStream)); - if (event) { - result = event_ptr->record(); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, - void *dst_ptr, const void *src_ptr, - size_t size, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - assert(queue != nullptr); - assert(dst_ptr != nullptr); - assert(src_ptr != nullptr); - pi_result result = PI_SUCCESS; - - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_next_transfer_stream(); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, - events_waitlist); - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); - event_ptr->start(); - } - result = PI_CHECK_ERROR(cuMemcpyAsync( - (CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size, cuStream)); - if (event) { - result = event_ptr->record(); - } - if (blocking) { - result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream)); - } - if (event) { - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } - return result; -} - -pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, - size_t size, - pi_usm_migration_flags flags, - pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, - pi_event *event) { - pi_device device = queue->get_context()->get_device(); - - // Certain cuda devices and Windows do not have support for some Unified - // Memory features. 
cuMemPrefetchAsync requires concurrent memory access
-  // for managed memory. Therefore, ignore prefetch hint if concurrent managed
-  // memory access is not available.
-  if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
-    setErrorMessage("Prefetch hint ignored as device does not support "
-                    "concurrent managed access",
-                    PI_SUCCESS);
-    return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
-  }
-
-  unsigned int is_managed;
-  PI_CHECK_ERROR(cuPointerGetAttribute(
-      &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr));
-  if (!is_managed) {
-    setErrorMessage("Prefetch hint ignored as prefetch only works with USM",
-                    PI_SUCCESS);
-    return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
-  }
-
-  // flags is currently unused so fail if set
-  if (flags != 0)
-    return PI_ERROR_INVALID_VALUE;
-  assert(queue != nullptr);
-  assert(ptr != nullptr);
-  pi_result result = PI_SUCCESS;
-  std::unique_ptr<_pi_event> event_ptr{nullptr};
-
-  try {
-    ScopedContext active(queue->get_context());
-    CUstream cuStream = queue->get_next_transfer_stream();
-    result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist,
-                               events_waitlist);
-    if (event) {
-      event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native(
-          PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream));
-      event_ptr->start();
-    }
-    result = PI_CHECK_ERROR(
-        cuMemPrefetchAsync((CUdeviceptr)ptr, size, device->get(), cuStream));
-    if (event) {
-      result = event_ptr->record();
-      *event = event_ptr.release();
-    }
-  } catch (pi_result err) {
-    result = err;
-  }
-  return result;
-}
-
-/// USM: memadvise API to govern behavior of automatic migration mechanisms
-pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr,
-                                        size_t length, pi_mem_advice advice,
-                                        pi_event *event) {
-  assert(queue != nullptr);
-  assert(ptr != nullptr);
-
-  // Certain cuda devices and Windows do not have support for some Unified
-  // Memory features. Passing CU_MEM_ADVISE_[UN]SET_PREFERRED_LOCATION and
-  // CU_MEM_ADVISE_[UN]SET_ACCESSED_BY to cuMemAdvise on a GPU device requires
-  // the GPU device to report a non-zero value for
-  // CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therefore, ignore memory
-  // advice if concurrent managed memory access is not available.
-  if (advice == PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION ||
-      advice == PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION ||
-      advice == PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY ||
-      advice == PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY ||
-      advice == PI_MEM_ADVICE_RESET) {
-    pi_device device = queue->get_context()->get_device();
-    if (!getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
-      setErrorMessage("Mem advise ignored as device does not support "
-                      "concurrent managed access",
-                      PI_SUCCESS);
-      return PI_ERROR_PLUGIN_SPECIFIC_ERROR;
-    }
-
-    // TODO: If ptr points to valid system-allocated pageable memory we should
-    // check that the device also has the
-    // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property.
- } - - unsigned int is_managed; - PI_CHECK_ERROR(cuPointerGetAttribute( - &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr)); - if (!is_managed) { - setErrorMessage( - "Memory advice ignored as memory advices only works with USM", - PI_SUCCESS); - return PI_ERROR_PLUGIN_SPECIFIC_ERROR; - } - - pi_result result = PI_SUCCESS; - std::unique_ptr<_pi_event> event_ptr{nullptr}; - - try { - ScopedContext active(queue->get_context()); - - if (event) { - event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_USER, queue, queue->get_next_transfer_stream())); - event_ptr->start(); - } - - switch (advice) { - case PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY: - case PI_MEM_ADVICE_CUDA_UNSET_READ_MOSTLY: - case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION: - case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION: - case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY: - case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY: - result = PI_CHECK_ERROR(cuMemAdvise( - (CUdeviceptr)ptr, length, - (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1), - queue->get_context()->get_device()->get())); - break; - case PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST: - case PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST: - case PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST: - case PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST: - result = PI_CHECK_ERROR(cuMemAdvise( - (CUdeviceptr)ptr, length, - (CUmem_advise)(advice - PI_MEM_ADVICE_CUDA_SET_READ_MOSTLY + 1 - - (PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION_HOST - - PI_MEM_ADVICE_CUDA_SET_PREFERRED_LOCATION)), - CU_DEVICE_CPU)); - break; - case PI_MEM_ADVICE_RESET: - PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, - CU_MEM_ADVISE_UNSET_READ_MOSTLY, - queue->get_context()->get_device()->get())); - PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, - CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - queue->get_context()->get_device()->get())); - PI_CHECK_ERROR(cuMemAdvise((CUdeviceptr)ptr, length, - CU_MEM_ADVISE_UNSET_ACCESSED_BY, - queue->get_context()->get_device()->get())); - break; - default: - sycl::detail::pi::die("Unknown advice"); - } - if (event) { - result = event_ptr->record(); - *event = event_ptr.release(); - } - } catch (pi_result err) { - result = err; - } catch (...) { - result = PI_ERROR_UNKNOWN; - } - return result; -} - -// TODO: Implement this. Remember to return true for -// PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. -pi_result cuda_piextUSMEnqueueFill2D(pi_queue, void *, size_t, size_t, - const void *, size_t, size_t, pi_uint32, - const pi_event *, pi_event *) { - sycl::detail::pi::die("piextUSMEnqueueFill2D: not implemented"); - return {}; -} - -// TODO: Implement this. Remember to return true for -// PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMSET2D_SUPPORT when it is implemented. 
-pi_result cuda_piextUSMEnqueueMemset2D(pi_queue, void *, size_t, int, size_t,
-                                       size_t, pi_uint32, const pi_event *,
-                                       pi_event *) {
-  sycl::detail::pi::die("cuda_piextUSMEnqueueMemset2D: not implemented");
-  return {};
-}
-
-/// 2D Memcpy API
-///
-/// \param queue is the queue to submit to
-/// \param blocking is whether this operation should block the host
-/// \param dst_ptr is the location the data will be copied to
-/// \param dst_pitch is the total width of the destination memory including
-/// padding
-/// \param src_ptr is the data to be copied
-/// \param src_pitch is the total width of the source memory including padding
-/// \param width is the width in bytes of each row to be copied
-/// \param height is the number of rows to be copied
-/// \param num_events_in_waitlist is the number of events to wait on
-/// \param events_waitlist is an array of events to wait on
-/// \param event is the event that represents this operation
-pi_result cuda_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking,
-                                       void *dst_ptr, size_t dst_pitch,
-                                       const void *src_ptr, size_t src_pitch,
-                                       size_t width, size_t height,
-                                       pi_uint32 num_events_in_wait_list,
-                                       const pi_event *event_wait_list,
-                                       pi_event *event) {
-
-  assert(queue != nullptr);
-
-  pi_result result = PI_SUCCESS;
-
-  try {
-    ScopedContext active(queue->get_context());
-    CUstream cuStream = queue->get_next_transfer_stream();
-    result = enqueueEventsWait(queue, cuStream, num_events_in_wait_list,
-                               event_wait_list);
-    if (event) {
-      (*event) = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT,
-                                        queue, cuStream);
-      (*event)->start();
-    }
-
-    // Determine the direction of copy using cuPointerGetAttribute
-    // for both the src_ptr and dst_ptr
-    CUDA_MEMCPY2D cpyDesc;
-    memset(&cpyDesc, 0, sizeof(cpyDesc));
-
-    getUSMHostOrDevicePtr(src_ptr, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice,
-                          &cpyDesc.srcHost);
-    getUSMHostOrDevicePtr(dst_ptr, &cpyDesc.dstMemoryType, &cpyDesc.dstDevice,
-                          &cpyDesc.dstHost);
-
-    cpyDesc.dstPitch = dst_pitch;
-    cpyDesc.srcPitch = src_pitch;
-    cpyDesc.WidthInBytes = width;
-    cpyDesc.Height = height;
-
-    result = PI_CHECK_ERROR(cuMemcpy2DAsync(&cpyDesc, cuStream));
-
-    if (event) {
-      (*event)->record();
-    }
-    if (blocking) {
-      result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
-    }
-  } catch (pi_result err) {
-    result = err;
-  }
-  return result;
-}
-
-/// API to query information about USM allocated pointers
-/// Valid Queries:
-/// PI_MEM_ALLOC_TYPE returns host/device/shared pi_host_usm value
-/// PI_MEM_ALLOC_BASE_PTR returns the base ptr of an allocation if
-/// the queried pointer fell inside an allocation.
-/// Result must fit in void *
-/// PI_MEM_ALLOC_SIZE returns how big the queried pointer's
-/// allocation is in bytes. Result is a size_t.
-/// PI_MEM_ALLOC_DEVICE returns the pi_device this was allocated against -/// -/// \param context is the pi_context -/// \param ptr is the pointer to query -/// \param param_name is the type of query to perform -/// \param param_value_size is the size of the result in bytes -/// \param param_value is the result -/// \param param_value_size_ret is how many bytes were written -pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, - pi_mem_alloc_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret) { - assert(context != nullptr); - assert(ptr != nullptr); - pi_result result = PI_SUCCESS; - - try { - ScopedContext active(context); - switch (param_name) { - case PI_MEM_ALLOC_TYPE: { - unsigned int value; - // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE - CUresult ret = cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr); - if (ret == CUDA_ERROR_INVALID_VALUE) { - // pointer not known to the CUDA subsystem - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_UNKNOWN); - } - result = check_error(ret, __func__, __LINE__ - 5, __FILE__); - if (value) { - // pointer to managed memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_SHARED); - } - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr)); - assert(value == CU_MEMORYTYPE_DEVICE || value == CU_MEMORYTYPE_HOST); - if (value == CU_MEMORYTYPE_DEVICE) { - // pointer to device memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_DEVICE); - } - if (value == CU_MEMORYTYPE_HOST) { - // pointer to host memory - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_HOST); - } - // should never get here -#ifdef _MSC_VER - __assume(0); -#else - __builtin_unreachable(); -#endif - return getInfo(param_value_size, param_value, param_value_size_ret, - PI_MEM_TYPE_UNKNOWN); - } - case PI_MEM_ALLOC_BASE_PTR: { -#if CUDA_VERSION >= 10020 - // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2 - unsigned int value; - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)ptr)); - return getInfo(param_value_size, param_value, param_value_size_ret, - value); -#else - return PI_ERROR_INVALID_VALUE; -#endif - } - case PI_MEM_ALLOC_SIZE: { -#if CUDA_VERSION >= 10020 - // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 - unsigned int value; - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)ptr)); - return getInfo(param_value_size, param_value, param_value_size_ret, - value); -#else - return PI_ERROR_INVALID_VALUE; -#endif - } - case PI_MEM_ALLOC_DEVICE: { - // get device index associated with this pointer - unsigned int device_idx; - result = PI_CHECK_ERROR(cuPointerGetAttribute( - &device_idx, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr)); - - // currently each device is in its own platform, so find the platform at - // the same index - std::vector platforms; - platforms.resize(device_idx + 1); - result = cuda_piPlatformsGet(device_idx + 1, platforms.data(), nullptr); - - // get the device from the platform - pi_device device = platforms[device_idx]->devices_[0].get(); - return getInfo(param_value_size, param_value, param_value_size_ret, - device); - } - } - } catch (pi_result error) { - result = error; - } - 
return result; -} - -pi_result cuda_piextEnqueueDeviceGlobalVariableWrite( - pi_queue queue, pi_program program, const char *name, - pi_bool blocking_write, size_t count, size_t offset, const void *src, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { - assert(queue != nullptr); - assert(program != nullptr); - - if (name == nullptr || src == nullptr) - return PI_ERROR_INVALID_VALUE; - - // Since CUDA requires a the global variable to be referenced by name, we use - // metadata to find the correct name to access it by. - auto device_global_name_it = program->globalIDMD_.find(name); - if (device_global_name_it == program->globalIDMD_.end()) - return PI_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; - - pi_result result = PI_SUCCESS; - try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); - - if (offset + count > device_global_size) - return PI_ERROR_INVALID_VALUE; - - return cuda_piextUSMEnqueueMemcpy( - queue, blocking_write, reinterpret_cast(device_global + offset), - src, count, num_events_in_wait_list, event_wait_list, event); - } catch (pi_result error) { - result = error; - } - return result; -} - -pi_result cuda_piextEnqueueDeviceGlobalVariableRead( - pi_queue queue, pi_program program, const char *name, pi_bool blocking_read, - size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { - assert(queue != nullptr); - assert(program != nullptr); - - if (name == nullptr || dst == nullptr) - return PI_ERROR_INVALID_VALUE; - - // Since CUDA requires a the global variable to be referenced by name, we use - // metadata to find the correct name to access it by. 
- auto device_global_name_it = program->globalIDMD_.find(name); - if (device_global_name_it == program->globalIDMD_.end()) - return PI_ERROR_INVALID_VALUE; - std::string device_global_name = device_global_name_it->second; - - pi_result result = PI_SUCCESS; - try { - CUdeviceptr device_global = 0; - size_t device_global_size = 0; - result = PI_CHECK_ERROR( - cuModuleGetGlobal(&device_global, &device_global_size, program->get(), - device_global_name.c_str())); - - if (offset + count > device_global_size) - return PI_ERROR_INVALID_VALUE; - - return cuda_piextUSMEnqueueMemcpy( - queue, blocking_read, dst, - reinterpret_cast(device_global + offset), count, - num_events_in_wait_list, event_wait_list, event); - } catch (pi_result error) { - result = error; - } - return result; -} - -/// Host Pipes -pi_result cuda_piextEnqueueReadHostPipe( - pi_queue queue, pi_program program, const char *pipe_symbol, - pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, pi_event *event) { - (void)queue; - (void)program; - (void)pipe_symbol; - (void)blocking; - (void)ptr; - (void)size; - (void)num_events_in_waitlist; - (void)events_waitlist; - (void)event; - - sycl::detail::pi::die("cuda_piextEnqueueReadHostPipe not implemented"); - return {}; -} - -pi_result cuda_piextEnqueueWriteHostPipe( - pi_queue queue, pi_program program, const char *pipe_symbol, - pi_bool blocking, void *ptr, size_t size, pi_uint32 num_events_in_waitlist, - const pi_event *events_waitlist, pi_event *event) { - (void)queue; - (void)program; - (void)pipe_symbol; - (void)blocking; - (void)ptr; - (void)size; - (void)num_events_in_waitlist; - (void)events_waitlist; - (void)event; - - sycl::detail::pi::die("cuda_piextEnqueueWriteHostPipe not implemented"); - return {}; -} - -// This API is called by Sycl RT to notify the end of the plugin lifetime. -// Windows: dynamically loaded plugins might have been unloaded already -// when this is called. Sycl RT holds onto the PI plugin so it can be -// called safely. But this is not transitive. If the PI plugin in turn -// dynamically loaded a different DLL, that may have been unloaded. -// TODO: add a global variable lifetime management code here (see -// pi_level_zero.cpp for reference) Currently this is just a NOOP. 
-pi_result cuda_piTearDown(void *) { - disableCUDATracing(); - return PI_SUCCESS; -} - -pi_result cuda_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, - uint64_t *HostTime) { - _pi_event::native_type event; - ScopedContext active(Device->get_context()); - - if (DeviceTime) { - PI_CHECK_ERROR(cuEventCreate(&event, CU_EVENT_DEFAULT)); - PI_CHECK_ERROR(cuEventRecord(event, 0)); - } - if (HostTime) { - - using namespace std::chrono; - *HostTime = - duration_cast(steady_clock::now().time_since_epoch()) - .count(); - } - - if (DeviceTime) { - PI_CHECK_ERROR(cuEventSynchronize(event)); - *DeviceTime = Device->get_elapsed_time(event); - } +// Forward declarations +void enableCUDATracing(); - return PI_SUCCESS; -} +//-- PI API implementation +extern "C" { const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING; @@ -5915,141 +46,145 @@ pi_result piPluginInit(pi_plugin *PluginInit) { (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&cuda_api); // Platform - _PI_CL(piPlatformsGet, cuda_piPlatformsGet) - _PI_CL(piPlatformGetInfo, cuda_piPlatformGetInfo) + _PI_CL(piPlatformsGet, pi2ur::piPlatformsGet) + _PI_CL(piPlatformGetInfo, pi2ur::piPlatformGetInfo) // Device - _PI_CL(piDevicesGet, cuda_piDevicesGet) - _PI_CL(piDeviceGetInfo, cuda_piDeviceGetInfo) - _PI_CL(piDevicePartition, cuda_piDevicePartition) - _PI_CL(piDeviceRetain, cuda_piDeviceRetain) - _PI_CL(piDeviceRelease, cuda_piDeviceRelease) - _PI_CL(piextDeviceSelectBinary, cuda_piextDeviceSelectBinary) - _PI_CL(piextGetDeviceFunctionPointer, cuda_piextGetDeviceFunctionPointer) - _PI_CL(piextDeviceGetNativeHandle, cuda_piextDeviceGetNativeHandle) + _PI_CL(piDevicesGet, pi2ur::piDevicesGet) + _PI_CL(piDeviceGetInfo, pi2ur::piDeviceGetInfo) + _PI_CL(piDevicePartition, pi2ur::piDevicePartition) + _PI_CL(piDeviceRetain, pi2ur::piDeviceRetain) + _PI_CL(piDeviceRelease, pi2ur::piDeviceRelease) + _PI_CL(piextDeviceSelectBinary, pi2ur::piextDeviceSelectBinary) + _PI_CL(piextGetDeviceFunctionPointer, pi2ur::piextGetDeviceFunctionPointer) + _PI_CL(piextDeviceGetNativeHandle, pi2ur::piextDeviceGetNativeHandle) _PI_CL(piextDeviceCreateWithNativeHandle, - cuda_piextDeviceCreateWithNativeHandle) + pi2ur::piextDeviceCreateWithNativeHandle) // Context - _PI_CL(piextContextSetExtendedDeleter, cuda_piextContextSetExtendedDeleter) - _PI_CL(piContextCreate, cuda_piContextCreate) - _PI_CL(piContextGetInfo, cuda_piContextGetInfo) - _PI_CL(piContextRetain, cuda_piContextRetain) - _PI_CL(piContextRelease, cuda_piContextRelease) - _PI_CL(piextContextGetNativeHandle, cuda_piextContextGetNativeHandle) + _PI_CL(piextContextSetExtendedDeleter, pi2ur::piextContextSetExtendedDeleter) + _PI_CL(piContextCreate, pi2ur::piContextCreate) + _PI_CL(piContextGetInfo, pi2ur::piContextGetInfo) + _PI_CL(piContextRetain, pi2ur::piContextRetain) + _PI_CL(piContextRelease, pi2ur::piContextRelease) + _PI_CL(piextContextGetNativeHandle, pi2ur::piextContextGetNativeHandle) _PI_CL(piextContextCreateWithNativeHandle, - cuda_piextContextCreateWithNativeHandle) + pi2ur::piextContextCreateWithNativeHandle) // Queue - _PI_CL(piQueueCreate, cuda_piQueueCreate) - _PI_CL(piextQueueCreate, cuda_piextQueueCreate) - _PI_CL(piQueueGetInfo, cuda_piQueueGetInfo) - _PI_CL(piQueueFinish, cuda_piQueueFinish) - _PI_CL(piQueueFlush, cuda_piQueueFlush) - _PI_CL(piQueueRetain, cuda_piQueueRetain) - _PI_CL(piQueueRelease, cuda_piQueueRelease) - _PI_CL(piextQueueGetNativeHandle, cuda_piextQueueGetNativeHandle) + _PI_CL(piQueueCreate, pi2ur::piQueueCreate) + _PI_CL(piextQueueCreate, 
pi2ur::piextQueueCreate) + _PI_CL(piQueueGetInfo, pi2ur::piQueueGetInfo) + _PI_CL(piQueueFinish, pi2ur::piQueueFinish) + _PI_CL(piQueueFlush, pi2ur::piQueueFlush) + _PI_CL(piQueueRetain, pi2ur::piQueueRetain) + _PI_CL(piQueueRelease, pi2ur::piQueueRelease) + _PI_CL(piextQueueGetNativeHandle, pi2ur::piextQueueGetNativeHandle) _PI_CL(piextQueueCreateWithNativeHandle, - cuda_piextQueueCreateWithNativeHandle) + pi2ur::piextQueueCreateWithNativeHandle) // Memory - _PI_CL(piMemBufferCreate, cuda_piMemBufferCreate) - _PI_CL(piMemImageCreate, cuda_piMemImageCreate) - _PI_CL(piMemGetInfo, cuda_piMemGetInfo) - _PI_CL(piMemImageGetInfo, cuda_piMemImageGetInfo) - _PI_CL(piMemRetain, cuda_piMemRetain) - _PI_CL(piMemRelease, cuda_piMemRelease) - _PI_CL(piMemBufferPartition, cuda_piMemBufferPartition) - _PI_CL(piextMemGetNativeHandle, cuda_piextMemGetNativeHandle) - _PI_CL(piextMemCreateWithNativeHandle, cuda_piextMemCreateWithNativeHandle) + _PI_CL(piMemBufferCreate, pi2ur::piMemBufferCreate) + _PI_CL(piMemImageCreate, pi2ur::piMemImageCreate) + _PI_CL(piMemGetInfo, pi2ur::piMemGetInfo) + _PI_CL(piMemImageGetInfo, pi2ur::piMemImageGetInfo) + _PI_CL(piMemRetain, pi2ur::piMemRetain) + _PI_CL(piMemRelease, pi2ur::piMemRelease) + _PI_CL(piMemBufferPartition, pi2ur::piMemBufferPartition) + _PI_CL(piextMemGetNativeHandle, pi2ur::piextMemGetNativeHandle) + _PI_CL(piextMemCreateWithNativeHandle, pi2ur::piextMemCreateWithNativeHandle) + // Program - _PI_CL(piProgramCreate, cuda_piProgramCreate) - _PI_CL(piclProgramCreateWithSource, cuda_piclProgramCreateWithSource) - _PI_CL(piProgramCreateWithBinary, cuda_piProgramCreateWithBinary) - _PI_CL(piProgramGetInfo, cuda_piProgramGetInfo) - _PI_CL(piProgramCompile, cuda_piProgramCompile) - _PI_CL(piProgramBuild, cuda_piProgramBuild) - _PI_CL(piProgramLink, cuda_piProgramLink) - _PI_CL(piProgramGetBuildInfo, cuda_piProgramGetBuildInfo) - _PI_CL(piProgramRetain, cuda_piProgramRetain) - _PI_CL(piProgramRelease, cuda_piProgramRelease) - _PI_CL(piextProgramGetNativeHandle, cuda_piextProgramGetNativeHandle) + _PI_CL(piProgramCreate, pi2ur::piProgramCreate) + _PI_CL(piclProgramCreateWithSource, pi2ur::piclProgramCreateWithSource) + _PI_CL(piProgramCreateWithBinary, pi2ur::piProgramCreateWithBinary) + _PI_CL(piProgramGetInfo, pi2ur::piProgramGetInfo) + _PI_CL(piProgramCompile, pi2ur::piProgramCompile) + _PI_CL(piProgramBuild, pi2ur::piProgramBuild) + _PI_CL(piProgramLink, pi2ur::piProgramLink) + _PI_CL(piProgramGetBuildInfo, pi2ur::piProgramGetBuildInfo) + _PI_CL(piProgramRetain, pi2ur::piProgramRetain) + _PI_CL(piProgramRelease, pi2ur::piProgramRelease) + _PI_CL(piextProgramGetNativeHandle, pi2ur::piextProgramGetNativeHandle) _PI_CL(piextProgramCreateWithNativeHandle, - cuda_piextProgramCreateWithNativeHandle) - // Kernel - _PI_CL(piKernelCreate, cuda_piKernelCreate) - _PI_CL(piKernelSetArg, cuda_piKernelSetArg) - _PI_CL(piKernelGetInfo, cuda_piKernelGetInfo) - _PI_CL(piKernelGetGroupInfo, cuda_piKernelGetGroupInfo) - _PI_CL(piKernelGetSubGroupInfo, cuda_piKernelGetSubGroupInfo) - _PI_CL(piKernelRetain, cuda_piKernelRetain) - _PI_CL(piKernelRelease, cuda_piKernelRelease) - _PI_CL(piKernelSetExecInfo, cuda_piKernelSetExecInfo) + pi2ur::piextProgramCreateWithNativeHandle) _PI_CL(piextProgramSetSpecializationConstant, - cuda_piextProgramSetSpecializationConstant) - _PI_CL(piextKernelSetArgPointer, cuda_piextKernelSetArgPointer) + pi2ur::piextProgramSetSpecializationConstant) + // Kernel + _PI_CL(piKernelCreate, pi2ur::piKernelCreate) + _PI_CL(piKernelSetArg, 
pi2ur::piKernelSetArg) + _PI_CL(piKernelGetInfo, pi2ur::piKernelGetInfo) + _PI_CL(piKernelGetGroupInfo, pi2ur::piKernelGetGroupInfo) + _PI_CL(piKernelGetSubGroupInfo, pi2ur::piKernelGetSubGroupInfo) + _PI_CL(piKernelRetain, pi2ur::piKernelRetain) + _PI_CL(piKernelRelease, pi2ur::piKernelRelease) + _PI_CL(piextKernelGetNativeHandle, pi2ur::piextKernelGetNativeHandle) + _PI_CL(piKernelSetExecInfo, pi2ur::piKernelSetExecInfo) + _PI_CL(piextKernelSetArgPointer, pi2ur::piKernelSetArgPointer) _PI_CL(piextKernelCreateWithNativeHandle, - cuda_piextKernelCreateWithNativeHandle) + pi2ur::piextKernelCreateWithNativeHandle) + // Event - _PI_CL(piEventCreate, cuda_piEventCreate) - _PI_CL(piEventGetInfo, cuda_piEventGetInfo) - _PI_CL(piEventGetProfilingInfo, cuda_piEventGetProfilingInfo) - _PI_CL(piEventsWait, cuda_piEventsWait) - _PI_CL(piEventSetCallback, cuda_piEventSetCallback) - _PI_CL(piEventSetStatus, cuda_piEventSetStatus) - _PI_CL(piEventRetain, cuda_piEventRetain) - _PI_CL(piEventRelease, cuda_piEventRelease) - _PI_CL(piextEventGetNativeHandle, cuda_piextEventGetNativeHandle) + _PI_CL(piEventCreate, pi2ur::piEventCreate) + _PI_CL(piEventGetInfo, pi2ur::piEventGetInfo) + _PI_CL(piEventGetProfilingInfo, pi2ur::piEventGetProfilingInfo) + _PI_CL(piEventsWait, pi2ur::piEventsWait) + _PI_CL(piEventSetCallback, pi2ur::piEventSetCallback) + _PI_CL(piEventSetStatus, pi2ur::piEventSetStatus) + _PI_CL(piEventRetain, pi2ur::piEventRetain) + _PI_CL(piEventRelease, pi2ur::piEventRelease) + _PI_CL(piextEventGetNativeHandle, pi2ur::piextEventGetNativeHandle) _PI_CL(piextEventCreateWithNativeHandle, - cuda_piextEventCreateWithNativeHandle) + pi2ur::piextEventCreateWithNativeHandle) // Sampler - _PI_CL(piSamplerCreate, cuda_piSamplerCreate) - _PI_CL(piSamplerGetInfo, cuda_piSamplerGetInfo) - _PI_CL(piSamplerRetain, cuda_piSamplerRetain) - _PI_CL(piSamplerRelease, cuda_piSamplerRelease) + _PI_CL(piSamplerCreate, pi2ur::piSamplerCreate) + _PI_CL(piSamplerGetInfo, pi2ur::piSamplerGetInfo) + _PI_CL(piSamplerRetain, pi2ur::piSamplerRetain) + _PI_CL(piSamplerRelease, pi2ur::piSamplerRelease) // Queue commands - _PI_CL(piEnqueueKernelLaunch, cuda_piEnqueueKernelLaunch) - _PI_CL(piEnqueueNativeKernel, cuda_piEnqueueNativeKernel) - _PI_CL(piEnqueueEventsWait, cuda_piEnqueueEventsWait) - _PI_CL(piEnqueueEventsWaitWithBarrier, cuda_piEnqueueEventsWaitWithBarrier) - _PI_CL(piEnqueueMemBufferRead, cuda_piEnqueueMemBufferRead) - _PI_CL(piEnqueueMemBufferReadRect, cuda_piEnqueueMemBufferReadRect) - _PI_CL(piEnqueueMemBufferWrite, cuda_piEnqueueMemBufferWrite) - _PI_CL(piEnqueueMemBufferWriteRect, cuda_piEnqueueMemBufferWriteRect) - _PI_CL(piEnqueueMemBufferCopy, cuda_piEnqueueMemBufferCopy) - _PI_CL(piEnqueueMemBufferCopyRect, cuda_piEnqueueMemBufferCopyRect) - _PI_CL(piEnqueueMemBufferFill, cuda_piEnqueueMemBufferFill) - _PI_CL(piEnqueueMemImageRead, cuda_piEnqueueMemImageRead) - _PI_CL(piEnqueueMemImageWrite, cuda_piEnqueueMemImageWrite) - _PI_CL(piEnqueueMemImageCopy, cuda_piEnqueueMemImageCopy) - _PI_CL(piEnqueueMemImageFill, cuda_piEnqueueMemImageFill) - _PI_CL(piEnqueueMemBufferMap, cuda_piEnqueueMemBufferMap) - _PI_CL(piEnqueueMemUnmap, cuda_piEnqueueMemUnmap) + _PI_CL(piEnqueueKernelLaunch, pi2ur::piEnqueueKernelLaunch) + _PI_CL(piEnqueueNativeKernel, pi2ur::piEnqueueNativeKernel) + _PI_CL(piEnqueueEventsWait, pi2ur::piEnqueueEventsWait) + _PI_CL(piEnqueueEventsWaitWithBarrier, pi2ur::piEnqueueEventsWaitWithBarrier) + _PI_CL(piEnqueueMemBufferRead, pi2ur::piEnqueueMemBufferRead) + _PI_CL(piEnqueueMemBufferReadRect, 
pi2ur::piEnqueueMemBufferReadRect) + _PI_CL(piEnqueueMemBufferWrite, pi2ur::piEnqueueMemBufferWrite) + _PI_CL(piEnqueueMemBufferWriteRect, pi2ur::piEnqueueMemBufferWriteRect) + _PI_CL(piEnqueueMemBufferCopy, pi2ur::piEnqueueMemBufferCopy) + _PI_CL(piEnqueueMemBufferCopyRect, pi2ur::piEnqueueMemBufferCopyRect) + _PI_CL(piEnqueueMemBufferFill, pi2ur::piEnqueueMemBufferFill) + _PI_CL(piEnqueueMemImageRead, pi2ur::piEnqueueMemImageRead) + _PI_CL(piEnqueueMemImageWrite, pi2ur::piEnqueueMemImageWrite) + _PI_CL(piEnqueueMemImageCopy, pi2ur::piEnqueueMemImageCopy) + _PI_CL(piEnqueueMemImageFill, pi2ur::piEnqueueMemImageFill) + _PI_CL(piEnqueueMemBufferMap, pi2ur::piEnqueueMemBufferMap) + _PI_CL(piEnqueueMemUnmap, pi2ur::piEnqueueMemUnmap) + // USM - _PI_CL(piextUSMHostAlloc, cuda_piextUSMHostAlloc) - _PI_CL(piextUSMDeviceAlloc, cuda_piextUSMDeviceAlloc) - _PI_CL(piextUSMSharedAlloc, cuda_piextUSMSharedAlloc) - _PI_CL(piextUSMFree, cuda_piextUSMFree) - _PI_CL(piextUSMEnqueueMemset, cuda_piextUSMEnqueueMemset) - _PI_CL(piextUSMEnqueueMemcpy, cuda_piextUSMEnqueueMemcpy) - _PI_CL(piextUSMEnqueuePrefetch, cuda_piextUSMEnqueuePrefetch) - _PI_CL(piextUSMEnqueueMemAdvise, cuda_piextUSMEnqueueMemAdvise) - _PI_CL(piextUSMEnqueueFill2D, cuda_piextUSMEnqueueFill2D) - _PI_CL(piextUSMEnqueueMemset2D, cuda_piextUSMEnqueueMemset2D) - _PI_CL(piextUSMEnqueueMemcpy2D, cuda_piextUSMEnqueueMemcpy2D) - _PI_CL(piextUSMGetMemAllocInfo, cuda_piextUSMGetMemAllocInfo) + _PI_CL(piextUSMHostAlloc, pi2ur::piextUSMHostAlloc) + _PI_CL(piextUSMDeviceAlloc, pi2ur::piextUSMDeviceAlloc) + _PI_CL(piextUSMSharedAlloc, pi2ur::piextUSMSharedAlloc) + _PI_CL(piextUSMFree, pi2ur::piextUSMFree) + _PI_CL(piextUSMEnqueueMemset, pi2ur::piextUSMEnqueueMemset) + _PI_CL(piextUSMEnqueueMemcpy, pi2ur::piextUSMEnqueueMemcpy) + _PI_CL(piextUSMEnqueuePrefetch, pi2ur::piextUSMEnqueuePrefetch) + _PI_CL(piextUSMEnqueueMemAdvise, pi2ur::piextUSMEnqueueMemAdvise) + _PI_CL(piextUSMEnqueueFill2D, pi2ur::piextUSMEnqueueFill2D) + _PI_CL(piextUSMEnqueueMemset2D, pi2ur::piextUSMEnqueueMemset2D) + _PI_CL(piextUSMEnqueueMemcpy2D, pi2ur::piextUSMEnqueueMemcpy2D) + _PI_CL(piextUSMGetMemAllocInfo, pi2ur::piextUSMGetMemAllocInfo) // Device global variable _PI_CL(piextEnqueueDeviceGlobalVariableWrite, - cuda_piextEnqueueDeviceGlobalVariableWrite) + pi2ur::piextEnqueueDeviceGlobalVariableWrite) _PI_CL(piextEnqueueDeviceGlobalVariableRead, - cuda_piextEnqueueDeviceGlobalVariableRead) + pi2ur::piextEnqueueDeviceGlobalVariableRead) // Host Pipe - _PI_CL(piextEnqueueReadHostPipe, cuda_piextEnqueueReadHostPipe) - _PI_CL(piextEnqueueWriteHostPipe, cuda_piextEnqueueWriteHostPipe) - - _PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj) - _PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler) - _PI_CL(piPluginGetLastError, cuda_piPluginGetLastError) - _PI_CL(piTearDown, cuda_piTearDown) - _PI_CL(piGetDeviceAndHostTimer, cuda_piGetDeviceAndHostTimer) - _PI_CL(piPluginGetBackendOption, cuda_piPluginGetBackendOption) + _PI_CL(piextEnqueueReadHostPipe, pi2ur::piextEnqueueReadHostPipe) + _PI_CL(piextEnqueueWriteHostPipe, pi2ur::piextEnqueueWriteHostPipe) + + _PI_CL(piextKernelSetArgMemObj, pi2ur::piextKernelSetArgMemObj) + _PI_CL(piextKernelSetArgSampler, pi2ur::piextKernelSetArgSampler) + _PI_CL(piPluginGetLastError, pi2ur::piPluginGetLastError) + _PI_CL(piTearDown, pi2ur::piTearDown) + _PI_CL(piGetDeviceAndHostTimer, pi2ur::piGetDeviceAndHostTimer) + _PI_CL(piPluginGetBackendOption, pi2ur::piPluginGetBackendOption) #undef _PI_CL diff --git 
a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index a4864cf673392..8fb4664199286 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -25,983 +25,55 @@ #define _PI_CUDA_PLUGIN_VERSION_STRING \ _PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION) -#include "sycl/detail/pi.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -extern "C" { - -/// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piContextRetain(pi_context); -pi_result cuda_piContextRelease(pi_context); -pi_result cuda_piDeviceRelease(pi_device); -pi_result cuda_piDeviceRetain(pi_device); -pi_result cuda_piProgramRetain(pi_program); -pi_result cuda_piProgramRelease(pi_program); -pi_result cuda_piQueueRelease(pi_queue); -pi_result cuda_piQueueRetain(pi_queue); -pi_result cuda_piMemRetain(pi_mem); -pi_result cuda_piMemRelease(pi_mem); -pi_result cuda_piKernelRetain(pi_kernel); -pi_result cuda_piKernelRelease(pi_kernel); -pi_result cuda_piKernelGetGroupInfo(pi_kernel kernel, pi_device device, - pi_kernel_group_info param_name, - size_t param_value_size, void *param_value, - size_t *param_value_size_ret); -/// \endcond -} +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Share code between the PI Plugin and UR Adapter +#include using _pi_stream_guard = std::unique_lock; -/// A PI platform stores all known PI devices, -/// in the CUDA plugin this is just a vector of -/// available devices since initialization is done -/// when devices are used. -/// -struct _pi_platform { - std::vector> devices_; +struct _pi_platform : ur_platform_handle_t_ { + using ur_platform_handle_t_::ur_platform_handle_t_; }; -/// PI device mapping to a CUdevice. -/// Includes an observer pointer to the platform, -/// and implements the reference counting semantics since -/// CUDA objects are not refcounted. 
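From this point on, each _pi_* handle type in pi_cuda.hpp is reduced to a thin wrapper that publicly inherits the corresponding ur_*_handle_t_ type and pulls in its constructors, so the PI handle and the UR handle share one object layout. A short sketch of that pattern with made-up names (not the real UR types), showing why the pi2ur layer can pass a PI handle to a UR entry point via reinterpret_cast:

// Hypothetical stand-ins for a UR handle type and its PI wrapper.
struct ur_thing_handle_t_ {
  explicit ur_thing_handle_t_(int Device) : Device(Device) {}
  int Device;
};
using ur_thing_handle_t = ur_thing_handle_t_ *;

struct _pi_thing : ur_thing_handle_t_ {
  // Inheriting constructors; the wrapper adds no members and no virtuals,
  // so a _pi_thing * and a ur_thing_handle_t point at the same layout.
  using ur_thing_handle_t_::ur_thing_handle_t_;
};
using pi_thing = _pi_thing *;

int queryDevice(pi_thing Thing) {
  // The cast the pi2ur entry points perform on every handle they receive.
  auto UrThing = reinterpret_cast<ur_thing_handle_t>(Thing);
  return UrThing->Device;
}

int main() {
  _pi_thing Thing(42);
  return queryDevice(&Thing) == 42 ? 0 : 1;
}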
-/// -struct _pi_device { -private: - using native_type = CUdevice; - - native_type cuDevice_; - CUcontext cuContext_; - CUevent evBase_; // CUDA event used as base counter - std::atomic_uint32_t refCount_; - pi_platform platform_; - - static constexpr pi_uint32 max_work_item_dimensions = 3u; - size_t max_work_item_sizes[max_work_item_dimensions]; - int max_work_group_size; - -public: - _pi_device(native_type cuDevice, CUcontext cuContext, CUevent evBase, - pi_platform platform) - : cuDevice_(cuDevice), cuContext_(cuContext), - evBase_(evBase), refCount_{1}, platform_(platform) {} - - ~_pi_device() { cuDevicePrimaryCtxRelease(cuDevice_); } - - native_type get() const noexcept { return cuDevice_; }; - - CUcontext get_context() const noexcept { return cuContext_; }; - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - pi_platform get_platform() const noexcept { return platform_; }; - - pi_uint64 get_elapsed_time(CUevent) const; - - void save_max_work_item_sizes(size_t size, - size_t *save_max_work_item_sizes) noexcept { - memcpy(max_work_item_sizes, save_max_work_item_sizes, size); - }; - - void save_max_work_group_size(int value) noexcept { - max_work_group_size = value; - }; - - void get_max_work_item_sizes(size_t ret_size, - size_t *ret_max_work_item_sizes) const noexcept { - memcpy(ret_max_work_item_sizes, max_work_item_sizes, ret_size); - }; - - int get_max_work_group_size() const noexcept { return max_work_group_size; }; +struct _pi_device : ur_device_handle_t_ { + using ur_device_handle_t_::ur_device_handle_t_; }; -/// PI context mapping to a CUDA context object. -/// -/// There is no direct mapping between a CUDA context and a PI context, -/// main differences described below: -/// -/// CUDA context vs PI context -/// -/// One of the main differences between the PI API and the CUDA driver API is -/// that the second modifies the state of the threads by assigning -/// `CUcontext` objects to threads. `CUcontext` objects store data associated -/// with a given device and control access to said device from the user side. -/// PI API context are objects that are passed to functions, and not bound -/// to threads. -/// The _pi_context object doesn't implement this behavior, only holds the -/// CUDA context data. The RAII object \ref ScopedContext implements the active -/// context behavior. -/// -/// Primary vs User-defined context -/// -/// CUDA has two different types of context, the Primary context, -/// which is usable by all threads on a given process for a given device, and -/// the aforementioned custom contexts. -/// CUDA documentation, and performance analysis, indicates it is recommended -/// to use Primary context whenever possible. -/// Primary context is used as well by the CUDA Runtime API. -/// For PI applications to interop with CUDA Runtime API, they have to use -/// the primary context - and make that active in the thread. -/// The `_pi_context` object can be constructed with a `kind` parameter -/// that allows to construct a Primary or `user-defined` context, so that -/// the PI object interface is always the same. -/// -/// Destructor callback -/// -/// Required to implement CP023, SYCL Extended Context Destruction, -/// the PI Context can store a number of callback functions that will be -/// called upon destruction of the PI Context. -/// See proposal for details. 
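The "active context" behaviour mentioned above is delegated to a RAII helper, ScopedContext, which the enqueue paths earlier in this file use as ScopedContext active(queue->get_context()). Its definition is not part of this hunk; the sketch below only illustrates the usual shape of such a guard under that assumption and is not the plugin's actual implementation:

#include <cuda.h>

// Assumed shape of a scoped-context guard: make the desired CUcontext
// current for the lifetime of the object, then restore the previous one.
class ScopedContextSketch {
  CUcontext Previous = nullptr;
  bool NeedsRestore = false;

public:
  explicit ScopedContextSketch(CUcontext Desired) {
    cuCtxGetCurrent(&Previous);
    if (Previous != Desired) {
      cuCtxSetCurrent(Desired);
      NeedsRestore = true;
    }
  }

  ~ScopedContextSketch() {
    if (NeedsRestore)
      cuCtxSetCurrent(Previous);
  }

  ScopedContextSketch(const ScopedContextSketch &) = delete;
  ScopedContextSketch &operator=(const ScopedContextSketch &) = delete;
};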
-/// -struct _pi_context { - - struct deleter_data { - pi_context_extended_deleter function; - void *user_data; - - void operator()() { function(user_data); } - }; - - using native_type = CUcontext; - - native_type cuContext_; - _pi_device *deviceId_; - std::atomic_uint32_t refCount_; - - _pi_context(_pi_device *devId) - : cuContext_{devId->get_context()}, deviceId_{devId}, refCount_{1} { - cuda_piDeviceRetain(deviceId_); - }; - - ~_pi_context() { cuda_piDeviceRelease(deviceId_); } - - void invoke_extended_deleters() { - std::lock_guard guard(mutex_); - for (auto &deleter : extended_deleters_) { - deleter(); - } - } - - void set_extended_deleter(pi_context_extended_deleter function, - void *user_data) { - std::lock_guard guard(mutex_); - extended_deleters_.emplace_back(deleter_data{function, user_data}); - } - - pi_device get_device() const noexcept { return deviceId_; } - - native_type get() const noexcept { return cuContext_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - -private: - std::mutex mutex_; - std::vector extended_deleters_; +struct _pi_context : ur_context_handle_t_ { + using ur_context_handle_t_::ur_context_handle_t_; }; -/// PI Mem mapping to CUDA memory allocations, both data and texture/surface. -/// \brief Represents non-SVM allocations on the CUDA backend. -/// Keeps tracks of all mapped regions used for Map/Unmap calls. -/// Only one region can be active at the same time per allocation. -struct _pi_mem { - - // TODO: Move as much shared data up as possible - using pi_context = _pi_context *; - - // Context where the memory object is accessibles - pi_context context_; - - /// Reference counting of the handler - std::atomic_uint32_t refCount_; - enum class mem_type { buffer, surface } mem_type_; - - /// A PI Memory object represents either plain memory allocations ("Buffers" - /// in OpenCL) or typed allocations ("Images" in OpenCL). - /// In CUDA their API handlers are different. Whereas "Buffers" are allocated - /// as pointer-like structs, "Images" are stored in Textures or Surfaces - /// This union allows implementation to use either from the same handler. - union mem_ { - // Handler for plain, pointer-based CUDA allocations - struct buffer_mem_ { - using native_type = CUdeviceptr; - - // If this allocation is a sub-buffer (i.e., a view on an existing - // allocation), this is the pointer to the parent handler structure - pi_mem parent_; - // CUDA handler for the pointer - native_type ptr_; - - /// Pointer associated with this device on the host - void *hostPtr_; - /// Size of the allocation in bytes - size_t size_; - /// Offset of the active mapped region. 
- size_t mapOffset_; - /// Pointer to the active mapped region, if any - void *mapPtr_; - /// Original flags for the mapped region - pi_map_flags mapFlags_; - - /** alloc_mode - * classic: Just a normal buffer allocated on the device via cuda malloc - * use_host_ptr: Use an address on the host for the device - * copy_in: The data for the device comes from the host but the host - pointer is not available later for re-use - * alloc_host_ptr: Uses pinned-memory allocation - */ - enum class alloc_mode { - classic, - use_host_ptr, - copy_in, - alloc_host_ptr - } allocMode_; - - native_type get() const noexcept { return ptr_; } - - size_t get_size() const noexcept { return size_; } - - void *get_map_ptr() const noexcept { return mapPtr_; } - - size_t get_map_offset(void *) const noexcept { return mapOffset_; } - - /// Returns a pointer to data visible on the host that contains - /// the data on the device associated with this allocation. - /// The offset is used to index into the CUDA allocation. - /// - void *map_to_ptr(size_t offset, pi_map_flags flags) noexcept { - assert(mapPtr_ == nullptr); - mapOffset_ = offset; - mapFlags_ = flags; - if (hostPtr_) { - mapPtr_ = static_cast(hostPtr_) + offset; - } else { - // TODO: Allocate only what is needed based on the offset - mapPtr_ = static_cast(malloc(this->get_size())); - } - return mapPtr_; - } - - /// Detach the allocation from the host memory. - void unmap(void *) noexcept { - assert(mapPtr_ != nullptr); - - if (mapPtr_ != hostPtr_) { - free(mapPtr_); - } - mapPtr_ = nullptr; - mapOffset_ = 0; - } - - pi_map_flags get_map_flags() const noexcept { - assert(mapPtr_ != nullptr); - return mapFlags_; - } - } buffer_mem_; - - // Handler data for surface object (i.e. Images) - struct surface_mem_ { - CUarray array_; - CUsurfObject surfObj_; - pi_mem_type imageType_; - - CUarray get_array() const noexcept { return array_; } - - CUsurfObject get_surface() const noexcept { return surfObj_; } - - pi_mem_type get_image_type() const noexcept { return imageType_; } - } surface_mem_; - } mem_; - - /// Constructs the PI MEM handler for a non-typed allocation ("buffer") - _pi_mem(pi_context ctxt, pi_mem parent, mem_::buffer_mem_::alloc_mode mode, - CUdeviceptr ptr, void *host_ptr, size_t size) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::buffer} { - mem_.buffer_mem_.ptr_ = ptr; - mem_.buffer_mem_.parent_ = parent; - mem_.buffer_mem_.hostPtr_ = host_ptr; - mem_.buffer_mem_.size_ = size; - mem_.buffer_mem_.mapOffset_ = 0; - mem_.buffer_mem_.mapPtr_ = nullptr; - mem_.buffer_mem_.mapFlags_ = PI_MAP_WRITE; - mem_.buffer_mem_.allocMode_ = mode; - if (is_sub_buffer()) { - cuda_piMemRetain(mem_.buffer_mem_.parent_); - } else { - cuda_piContextRetain(context_); - } - }; - - /// Constructs the PI allocation for an Image object (surface in CUDA) - _pi_mem(pi_context ctxt, CUarray array, CUsurfObject surf, - pi_mem_type image_type, void *host_ptr) - : context_{ctxt}, refCount_{1}, mem_type_{mem_type::surface} { - // Ignore unused parameter - (void)host_ptr; - - mem_.surface_mem_.array_ = array; - mem_.surface_mem_.surfObj_ = surf; - mem_.surface_mem_.imageType_ = image_type; - cuda_piContextRetain(context_); - } - - ~_pi_mem() { - if (mem_type_ == mem_type::buffer) { - if (is_sub_buffer()) { - cuda_piMemRelease(mem_.buffer_mem_.parent_); - return; - } - } - cuda_piContextRelease(context_); - } - - // TODO: Move as many shared funcs up as possible - bool is_buffer() const noexcept { return mem_type_ == mem_type::buffer; } - - bool is_sub_buffer() const noexcept { 
- return (is_buffer() && (mem_.buffer_mem_.parent_ != nullptr)); - } - - bool is_image() const noexcept { return mem_type_ == mem_type::surface; } - - pi_context get_context() const noexcept { return context_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_mem : ur_mem_handle_t_ { + using ur_mem_handle_t_::ur_mem_handle_t_; }; -/// PI queue mapping on to CUstream objects. -/// -struct _pi_queue { - using native_type = CUstream; - static constexpr int default_num_compute_streams = 128; - static constexpr int default_num_transfer_streams = 64; - - std::vector compute_streams_; - std::vector transfer_streams_; - // delay_compute_ keeps track of which streams have been recently reused and - // their next use should be delayed. If a stream has been recently reused it - // will be skipped the next time it would be selected round-robin style. When - // skipped, its delay flag is cleared. - std::vector delay_compute_; - // keep track of which streams have applied barrier - std::vector compute_applied_barrier_; - std::vector transfer_applied_barrier_; - _pi_context *context_; - _pi_device *device_; - pi_queue_properties properties_; - CUevent barrier_event_ = nullptr; - CUevent barrier_tmp_event_ = nullptr; - std::atomic_uint32_t refCount_; - std::atomic_uint32_t eventCount_; - std::atomic_uint32_t compute_stream_idx_; - std::atomic_uint32_t transfer_stream_idx_; - unsigned int num_compute_streams_; - unsigned int num_transfer_streams_; - unsigned int last_sync_compute_streams_; - unsigned int last_sync_transfer_streams_; - unsigned int flags_; - // When compute_stream_sync_mutex_ and compute_stream_mutex_ both need to be - // locked at the same time, compute_stream_sync_mutex_ should be locked first - // to avoid deadlocks - std::mutex compute_stream_sync_mutex_; - std::mutex compute_stream_mutex_; - std::mutex transfer_stream_mutex_; - std::mutex barrier_mutex_; - bool has_ownership_; - - _pi_queue(std::vector &&compute_streams, - std::vector &&transfer_streams, _pi_context *context, - _pi_device *device, pi_queue_properties properties, - unsigned int flags, bool backend_owns = true) - : compute_streams_{std::move(compute_streams)}, - transfer_streams_{std::move(transfer_streams)}, - delay_compute_(compute_streams_.size(), false), - compute_applied_barrier_(compute_streams_.size()), - transfer_applied_barrier_(transfer_streams_.size()), context_{context}, - device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, - compute_stream_idx_{0}, transfer_stream_idx_{0}, - num_compute_streams_{0}, num_transfer_streams_{0}, - last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, - flags_(flags), has_ownership_{backend_owns} { - cuda_piContextRetain(context_); - cuda_piDeviceRetain(device_); - } - - ~_pi_queue() { - cuda_piContextRelease(context_); - cuda_piDeviceRelease(device_); - } - - void compute_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i); - void transfer_stream_wait_for_barrier_if_needed(CUstream stream, - pi_uint32 stream_i); - - // get_next_compute/transfer_stream() functions return streams from - // appropriate pools in round-robin fashion - native_type get_next_compute_stream(pi_uint32 *stream_token = nullptr); - // this overload tries select a stream that was used by one of dependancies. - // If that is not possible returns a new stream. 
If a stream is reused it - // returns a lock that needs to remain locked as long as the stream is in use - native_type get_next_compute_stream(pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, - _pi_stream_guard &guard, - pi_uint32 *stream_token = nullptr); - native_type get_next_transfer_stream(); - native_type get() { return get_next_compute_stream(); }; - - bool has_been_synchronized(pi_uint32 stream_token) { - // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { - return false; - } - return last_sync_compute_streams_ > stream_token; - } - - bool can_reuse_stream(pi_uint32 stream_token) { - // stream token not associated with one of the compute streams - if (stream_token == std::numeric_limits::max()) { - return false; - } - // If the command represented by the stream token was not the last command - // enqueued to the stream we can not reuse the stream - we need to allow for - // commands enqueued after it and the one we are about to enqueue to run - // concurrently - bool is_last_command = - (compute_stream_idx_ - stream_token) <= compute_streams_.size(); - // If there was a barrier enqueued to the queue after the command - // represented by the stream token we should not reuse the stream, as we can - // not take that stream into account for the bookkeeping for the next - // barrier - such a stream would not be synchronized with. Performance-wise - // it does not matter that we do not reuse the stream, as the work - // represented by the stream token is guaranteed to be complete by the - // barrier before any work we are about to enqueue to the stream will start, - // so the event does not need to be synchronized with. - return is_last_command && !has_been_synchronized(stream_token); - } - - template bool all_of(T &&f) { - { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - if (!std::all_of(compute_streams_.begin(), compute_streams_.begin() + end, - f)) - return false; - } - { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - if (!std::all_of(transfer_streams_.begin(), - transfer_streams_.begin() + end, f)) - return false; - } - return true; - } - - template void for_each_stream(T &&f) { - { - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_); - for (unsigned int i = 0; i < end; i++) { - f(compute_streams_[i]); - } - } - { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = - std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); - for (unsigned int i = 0; i < end; i++) { - f(transfer_streams_[i]); - } - } - } - - template void sync_streams(T &&f) { - auto sync_compute = [&f, &streams = compute_streams_, - &delay = delay_compute_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - delay[i] = false; - } - }; - auto sync_transfer = [&f, &streams = transfer_streams_](unsigned int start, - unsigned int stop) { - for (unsigned int i = start; i < stop; i++) { - f(streams[i]); - } - }; - { - unsigned int size = static_cast(compute_streams_.size()); - std::lock_guard compute_sync_guard( - compute_stream_sync_mutex_); - std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int start = 
last_sync_compute_streams_; - unsigned int end = num_compute_streams_ < size - ? num_compute_streams_ - : compute_stream_idx_.load(); - if (end - start >= size) { - sync_compute(0, size); - } else { - start %= size; - end %= size; - if (start <= end) { - sync_compute(start, end); - } else { - sync_compute(start, size); - sync_compute(0, end); - } - } - if (ResetUsed) { - last_sync_compute_streams_ = end; - } - } - { - unsigned int size = static_cast(transfer_streams_.size()); - if (size > 0) { - std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int start = last_sync_transfer_streams_; - unsigned int end = num_transfer_streams_ < size - ? num_transfer_streams_ - : transfer_stream_idx_.load(); - if (end - start >= size) { - sync_transfer(0, size); - } else { - start %= size; - end %= size; - if (start <= end) { - sync_transfer(start, end); - } else { - sync_transfer(start, size); - sync_transfer(0, end); - } - } - if (ResetUsed) { - last_sync_transfer_streams_ = end; - } - } - } - } - - _pi_context *get_context() const { return context_; }; - - _pi_device *get_device() const { return device_; }; - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - pi_uint32 get_next_event_id() noexcept { return ++eventCount_; } - - bool backend_has_ownership() const noexcept { return has_ownership_; } +struct _pi_queue : ur_queue_handle_t_ { + using ur_queue_handle_t_::ur_queue_handle_t_; }; -typedef void (*pfn_notify)(pi_event event, pi_int32 eventCommandStatus, - void *userData); -/// PI Event mapping to CUevent -/// -struct _pi_event { -public: - using native_type = CUevent; - - pi_result record(); - - pi_result wait(); - - pi_result start(); - - native_type get() const noexcept { return evEnd_; }; - - pi_queue get_queue() const noexcept { return queue_; } - - CUstream get_stream() const noexcept { return stream_; } - - pi_uint32 get_compute_stream_token() const noexcept { return streamToken_; } - - pi_command_type get_command_type() const noexcept { return commandType_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - bool is_recorded() const noexcept { return isRecorded_; } - - bool is_started() const noexcept { return isStarted_; } - - bool is_completed() const noexcept; - - pi_int32 get_execution_status() const noexcept { - - if (!is_recorded()) { - return PI_EVENT_SUBMITTED; - } - - if (!is_completed()) { - return PI_EVENT_RUNNING; - } - return PI_EVENT_COMPLETE; - } - - pi_context get_context() const noexcept { return context_; }; - - pi_uint32 increment_reference_count() { return ++refCount_; } - - pi_uint32 decrement_reference_count() { return --refCount_; } - - pi_uint32 get_event_id() const noexcept { return eventId_; } - - bool backend_has_ownership() const noexcept { return has_ownership_; } - - // Returns the counter time when the associated command(s) were enqueued - // - pi_uint64 get_queued_time() const; - - // Returns the counter time when the associated command(s) started execution - // - pi_uint64 get_start_time() const; - - // Returns the counter time when the associated command(s) completed - // - pi_uint64 get_end_time() const; - - // construct a native CUDA. This maps closely to the underlying CUDA event. 
- static pi_event - make_native(pi_command_type type, pi_queue queue, CUstream stream, - pi_uint32 stream_token = std::numeric_limits::max()) { - return new _pi_event(type, queue->get_context(), queue, stream, - stream_token); - } - - static pi_event make_with_native(pi_context context, CUevent eventNative) { - return new _pi_event(context, eventNative); - } - - pi_result release(); - - ~_pi_event(); - -private: - // This constructor is private to force programmers to use the make_native / - // make_user static members in order to create a pi_event for CUDA. - _pi_event(pi_command_type type, pi_context context, pi_queue queue, - CUstream stream, pi_uint32 stream_token); - - // This constructor is private to force programmers to use the - // make_with_native for event introp - _pi_event(pi_context context, CUevent eventNative); - - pi_command_type commandType_; // The type of command associated with event. - - std::atomic_uint32_t refCount_; // Event reference count. - - bool has_ownership_; // Signifies if event owns the native type. - - bool hasBeenWaitedOn_; // Signifies whether the event has been waited - // on through a call to wait(), which implies - // that it has completed. - - bool isRecorded_; // Signifies wether a native CUDA event has been recorded - // yet. - bool isStarted_; // Signifies wether the operation associated with the - // PI event has started or not - // - - pi_uint32 streamToken_; - pi_uint32 eventId_; // Queue identifier of the event. - - native_type evEnd_; // CUDA event handle. If this _pi_event represents a user - // event, this will be nullptr. - - native_type evStart_; // CUDA event handle associated with the start - - native_type evQueued_; // CUDA event handle associated with the time - // the command was enqueued - - pi_queue queue_; // pi_queue associated with the event. If this is a user - // event, this will be nullptr. - - CUstream stream_; // CUstream associated with the event. If this is a user - // event, this will be uninitialized. - - pi_context context_; // pi_context associated with the event. If this is a - // native event, this will be the same context associated - // with the queue_ member. 
+struct _pi_event : ur_event_handle_t_ { + using ur_event_handle_t_::ur_event_handle_t_; }; -/// Implementation of PI Program on CUDA Module object -/// -struct _pi_program { - using native_type = CUmodule; - native_type module_; - const char *binary_; - size_t binarySizeInBytes_; - std::atomic_uint32_t refCount_; - _pi_context *context_; - - // Metadata - std::unordered_map> - kernelReqdWorkGroupSizeMD_; - std::unordered_map globalIDMD_; - - constexpr static size_t MAX_LOG_SIZE = 8192u; - - char errorLog_[MAX_LOG_SIZE], infoLog_[MAX_LOG_SIZE]; - std::string buildOptions_; - pi_program_build_status buildStatus_ = PI_PROGRAM_BUILD_STATUS_NONE; - - _pi_program(pi_context ctxt); - ~_pi_program(); - - pi_result set_metadata(const pi_device_binary_property *metadata, - size_t length); - - pi_result set_binary(const char *binary, size_t binarySizeInBytes); - - pi_result build_program(const char *build_options); - - pi_context get_context() const { return context_; }; - - native_type get() const noexcept { return module_; }; - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_program : ur_program_handle_t_ { + using ur_program_handle_t_::ur_program_handle_t_; }; -/// Implementation of a PI Kernel for CUDA -/// -/// PI Kernels are used to set kernel arguments, -/// creating a state on the Kernel object for a given -/// invocation. This is not the case of CUFunction objects, -/// which are simply passed together with the arguments on the invocation. -/// The PI Kernel implementation for CUDA stores the list of arguments, -/// argument sizes and offsets to emulate the interface of PI Kernel, -/// saving the arguments for the later dispatch. -/// Note that in PI API, the Local memory is specified as a size per -/// individual argument, but in CUDA only the total usage of shared -/// memory is required since it is not passed as a parameter. -/// A compiler pass converts the PI API local memory model into the -/// CUDA shared model. This object simply calculates the total of -/// shared memory, and the initial offsets of each parameter. -/// -struct _pi_kernel { - using native_type = CUfunction; - - native_type function_; - native_type functionWithOffsetParam_; - std::string name_; - pi_context context_; - pi_program program_; - std::atomic_uint32_t refCount_; - - static constexpr pi_uint32 REQD_THREADS_PER_BLOCK_DIMENSIONS = 3u; - size_t reqdThreadsPerBlock_[REQD_THREADS_PER_BLOCK_DIMENSIONS]; - - /// Structure that holds the arguments to the kernel. - /// Note earch argument size is known, since it comes - /// from the kernel signature. - /// This is not something can be queried from the CUDA API - /// so there is a hard-coded size (\ref MAX_PARAM_BYTES) - /// and a storage. - /// - struct arguments { - static constexpr size_t MAX_PARAM_BYTES = 4000u; - using args_t = std::array; - using args_size_t = std::vector; - using args_index_t = std::vector; - args_t storage_; - args_size_t paramSizes_; - args_index_t indices_; - args_size_t offsetPerIndex_; - - std::uint32_t implicitOffsetArgs_[3] = {0, 0, 0}; - - arguments() { - // Place the implicit offset index at the end of the indicies collection - indices_.emplace_back(&implicitOffsetArgs_); - } - - /// Adds an argument to the kernel. - /// If the argument existed before, it is replaced. - /// Otherwise, it is added. - /// Gaps are filled with empty arguments. 
- /// Implicit offset argument is kept at the back of the indices collection. - void add_arg(size_t index, size_t size, const void *arg, - size_t localSize = 0) { - if (index + 2 > indices_.size()) { - // Move implicit offset argument index with the end - indices_.resize(index + 2, indices_.back()); - // Ensure enough space for the new argument - paramSizes_.resize(index + 1); - offsetPerIndex_.resize(index + 1); - } - paramSizes_[index] = size; - // calculate the insertion point on the array - size_t insertPos = std::accumulate(std::begin(paramSizes_), - std::begin(paramSizes_) + index, 0); - // Update the stored value for the argument - std::memcpy(&storage_[insertPos], arg, size); - indices_[index] = &storage_[insertPos]; - offsetPerIndex_[index] = localSize; - } - - void add_local_arg(size_t index, size_t size) { - size_t localOffset = this->get_local_size(); - - // maximum required alignment is the size of the largest vector type - const size_t max_alignment = sizeof(double) * 16; - - // for arguments smaller than the maximum alignment simply align to the - // size of the argument - const size_t alignment = std::min(max_alignment, size); - - // align the argument - size_t alignedLocalOffset = localOffset; - if (localOffset % alignment != 0) { - alignedLocalOffset += alignment - (localOffset % alignment); - } - - add_arg(index, sizeof(size_t), (const void *)&(alignedLocalOffset), - size + (alignedLocalOffset - localOffset)); - } - - void set_implicit_offset(size_t size, std::uint32_t *implicitOffset) { - assert(size == sizeof(std::uint32_t) * 3); - std::memcpy(implicitOffsetArgs_, implicitOffset, size); - } - - void clear_local_size() { - std::fill(std::begin(offsetPerIndex_), std::end(offsetPerIndex_), 0); - } - - const args_index_t &get_indices() const noexcept { return indices_; } - - pi_uint32 get_local_size() const { - return std::accumulate(std::begin(offsetPerIndex_), - std::end(offsetPerIndex_), 0); - } - } args_; - - _pi_kernel(CUfunction func, CUfunction funcWithOffsetParam, const char *name, - pi_program program, pi_context ctxt) - : function_{func}, functionWithOffsetParam_{funcWithOffsetParam}, - name_{name}, context_{ctxt}, program_{program}, refCount_{1} { - cuda_piProgramRetain(program_); - cuda_piContextRetain(context_); - /// Note: this code assumes that there is only one device per context - pi_result retError = cuda_piKernelGetGroupInfo( - this, ctxt->get_device(), PI_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, - sizeof(reqdThreadsPerBlock_), reqdThreadsPerBlock_, nullptr); - (void)retError; - assert(retError == PI_SUCCESS); - } - - ~_pi_kernel() { - cuda_piProgramRelease(program_); - cuda_piContextRelease(context_); - } - - pi_program get_program() const noexcept { return program_; } - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } - - native_type get() const noexcept { return function_; }; - - native_type get_with_offset_parameter() const noexcept { - return functionWithOffsetParam_; - }; - - bool has_with_offset_parameter() const noexcept { - return functionWithOffsetParam_ != nullptr; - } - - pi_context get_context() const noexcept { return context_; }; - - const char *get_name() const noexcept { return name_.c_str(); } - - /// Returns the number of arguments, excluding the implicit global offset. 
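add_local_arg above packs each local (shared-memory) argument at the next offset aligned to the argument's size (capped at the largest vector type) and records the padding as part of that argument's size, so get_local_size() always yields the next free offset. A small standalone check of that arithmetic, assuming 12 bytes are already reserved and an 8-byte local argument is added:

#include <algorithm>
#include <cassert>
#include <cstddef>

int main() {
  const std::size_t MaxAlignment = sizeof(double) * 16; // largest vector type
  std::size_t LocalOffset = 12; // bytes already reserved for local args
  std::size_t Size = 8;         // new local argument

  // Align to the argument size, capped at the maximum alignment.
  const std::size_t Alignment = std::min(MaxAlignment, Size);
  std::size_t AlignedLocalOffset = LocalOffset;
  if (LocalOffset % Alignment != 0)
    AlignedLocalOffset += Alignment - (LocalOffset % Alignment);

  assert(AlignedLocalOffset == 16); // 12 rounded up to a multiple of 8
  // The 4 padding bytes are charged to this argument, so the running total
  // (12 + 8 + 4 = 24) is again the next free, aligned offset.
  assert(LocalOffset + Size + (AlignedLocalOffset - LocalOffset) == 24);
  return 0;
}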
- /// Note this only returns the current known number of arguments, not the - /// real one required by the kernel, since this cannot be queried from - /// the CUDA Driver API - pi_uint32 get_num_args() const noexcept { return args_.indices_.size() - 1; } - - void set_kernel_arg(int index, size_t size, const void *arg) { - args_.add_arg(index, size, arg); - } - - void set_kernel_local_arg(int index, size_t size) { - args_.add_local_arg(index, size); - } - - void set_implicit_offset_arg(size_t size, std::uint32_t *implicitOffset) { - args_.set_implicit_offset(size, implicitOffset); - } - - const arguments::args_index_t &get_arg_indices() const { - return args_.get_indices(); - } - - pi_uint32 get_local_size() const noexcept { return args_.get_local_size(); } - - void clear_local_size() { args_.clear_local_size(); } +struct _pi_kernel : ur_kernel_handle_t_ { + using ur_kernel_handle_t_::ur_kernel_handle_t_; }; -/// Implementation of samplers for CUDA -/// -/// Sampler property layout: -/// | 31 30 ... 6 5 | 4 3 2 | 1 | 0 | -/// | N/A | addressing mode | fiter mode | normalize coords | -struct _pi_sampler { - std::atomic_uint32_t refCount_; - pi_uint32 props_; - pi_context context_; - - _pi_sampler(pi_context context) - : refCount_(1), props_(0), context_(context) {} - - pi_uint32 increment_reference_count() noexcept { return ++refCount_; } - - pi_uint32 decrement_reference_count() noexcept { return --refCount_; } - - pi_uint32 get_reference_count() const noexcept { return refCount_; } +struct _pi_sampler : ur_sampler_handle_t_ { + using ur_sampler_handle_t_::ur_sampler_handle_t_; }; -// ------------------------------------------------------------- -// Helper types and functions -// - #endif // PI_CUDA_HPP diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index e829d012e55b4..8cff5b2848b0f 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -123,6 +123,51 @@ set_target_properties("ur_adapter_level_zero" PROPERTIES SOVERSION "0" ) +if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) + # Build CUDA adapter + add_sycl_library("ur_adapter_cuda" SHARED + SOURCES + "ur/ur.hpp" + "ur/ur.cpp" + "ur/usm_allocator.cpp" + "ur/usm_allocator.hpp" + "ur/adapters/cuda/common.cpp" + "ur/adapters/cuda/common.hpp" + "ur/adapters/cuda/context.cpp" + "ur/adapters/cuda/context.hpp" + "ur/adapters/cuda/device.cpp" + "ur/adapters/cuda/device.hpp" + "ur/adapters/cuda/enqueue.cpp" + "ur/adapters/cuda/event.cpp" + "ur/adapters/cuda/event.hpp" + "ur/adapters/cuda/kernel.cpp" + "ur/adapters/cuda/kernel.hpp" + "ur/adapters/cuda/memory.cpp" + "ur/adapters/cuda/memory.hpp" + "ur/adapters/cuda/platform.cpp" + "ur/adapters/cuda/platform.hpp" + "ur/adapters/cuda/program.cpp" + "ur/adapters/cuda/program.hpp" + "ur/adapters/cuda/queue.cpp" + "ur/adapters/cuda/queue.hpp" + "ur/adapters/cuda/sampler.cpp" + "ur/adapters/cuda/sampler.hpp" + "ur/adapters/cuda/tracing.cpp" + "ur/adapters/cuda/ur_interface_loader.cpp" + "ur/adapters/cuda/usm.cpp" + INCLUDE_DIRS + ${sycl_inc_dir} + LIBRARIES + UnifiedRuntime-Headers + Threads::Threads + cudadrv + ) + + set_target_properties("ur_adapter_cuda" PROPERTIES + VERSION "0.0.0" + SOVERSION "0" + ) +endif() if (TARGET UnifiedRuntimeLoader) set_target_properties(hello_world PROPERTIES EXCLUDE_FROM_ALL 1 EXCLUDE_FROM_DEFAULT_BUILD 1) diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 4ba4104ce6c3a..5fed9d0f933f7 100644 --- 
a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -1017,6 +1017,10 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, InfoType = UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION; break; } + case PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { + InfoType = UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP; + break; + } default: return PI_ERROR_UNKNOWN; }; @@ -1607,17 +1611,20 @@ inline pi_result piProgramCreateWithBinary( reinterpret_cast(Context); auto UrDevice = reinterpret_cast(DeviceList[0]); - std::unique_ptr pMetadatas( - new ur_program_metadata_t[NumMetadataEntries]); - for (unsigned i = 0; i < NumMetadataEntries; i++) { - HANDLE_ERRORS(mapPIMetadataToUR(&Metadata[i], &pMetadatas[i])); - } - - ur_program_properties_t Properties; + ur_program_properties_t Properties = {}; Properties.stype = UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES; Properties.pNext = nullptr; Properties.count = NumMetadataEntries; - Properties.pMetadatas = pMetadatas.get(); + + std::unique_ptr pMetadatas; + if (NumMetadataEntries) { + pMetadatas.reset(new ur_program_metadata_t[NumMetadataEntries]); + for (unsigned i = 0; i < NumMetadataEntries; i++) { + HANDLE_ERRORS(mapPIMetadataToUR(&Metadata[i], &pMetadatas[i])); + } + + Properties.pMetadatas = pMetadatas.get(); + } ur_program_handle_t *UrProgram = reinterpret_cast(Program); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp new file mode 100644 index 0000000000000..86975e5097257 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -0,0 +1,106 @@ +//===--------- common.cpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "common.hpp" + +#include + +#include + +ur_result_t mapErrorUR(CUresult Result) { + switch (Result) { + case CUDA_SUCCESS: + return UR_RESULT_SUCCESS; + case CUDA_ERROR_NOT_PERMITTED: + return UR_RESULT_ERROR_INVALID_OPERATION; + case CUDA_ERROR_INVALID_CONTEXT: + return UR_RESULT_ERROR_INVALID_CONTEXT; + case CUDA_ERROR_INVALID_DEVICE: + return UR_RESULT_ERROR_INVALID_DEVICE; + case CUDA_ERROR_INVALID_VALUE: + return UR_RESULT_ERROR_INVALID_VALUE; + case CUDA_ERROR_OUT_OF_MEMORY: + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + default: + return UR_RESULT_ERROR_UNKNOWN; + } +} + +ur_result_t checkErrorUR(CUresult Result, const char *Function, int Line, + const char *File) { + if (Result == CUDA_SUCCESS || Result == CUDA_ERROR_DEINITIALIZED) { + return UR_RESULT_SUCCESS; + } + + if (std::getenv("SYCL_PI_SUPPRESS_ERROR_MESSAGE") == nullptr) { + const char *ErrorString = nullptr; + const char *ErrorName = nullptr; + cuGetErrorName(Result, &ErrorName); + cuGetErrorString(Result, &ErrorString); + std::stringstream SS; + SS << "\nUR CUDA ERROR:" + << "\n\tValue: " << Result + << "\n\tName: " << ErrorName + << "\n\tDescription: " << ErrorString + << "\n\tFunction: " << Function << "\n\tSource Location: " << File + << ":" << Line << "\n" + << std::endl; + std::cerr << SS.str(); + } + + if (std::getenv("PI_CUDA_ABORT") != nullptr) { + std::abort(); + } + + throw mapErrorUR(Result); +} + +std::string getCudaVersionString() { + int driver_version = 0; + cuDriverGetVersion(&driver_version); + // The version is returned as (1000 major + 10 minor). + std::stringstream stream; + stream << "CUDA " << driver_version / 1000 << "." + << driver_version % 1000 / 10; + return stream.str(); +} + +void sycl::detail::ur::die(const char *Message) { + std::cerr << "ur_die: " << Message << std::endl; + std::terminate(); +} + +void sycl::detail::ur::assertion(bool Condition, const char *Message) { + if (!Condition) + die(Message); +} + +void sycl::detail::ur::cuPrint(const char *Message) { + std::cerr << "ur_print: " << Message << std::endl; +} + +// Global variables for ZER_EXT_RESULT_ADAPTER_SPECIFIC_ERROR +thread_local ur_result_t ErrorMessageCode = UR_RESULT_SUCCESS; +thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *pMessage, + ur_result_t ErrorCode) { + assert(strlen(pMessage) <= MaxMessageSize); + strcpy(ErrorMessage, pMessage); + ErrorMessageCode = ErrorCode; +} + +// Returns plugin specific error and warning messages; common implementation +// that can be shared between adapters +ur_result_t urGetLastResult(ur_platform_handle_t, const char **ppMessage) { + *ppMessage = &ErrorMessage[0]; + return ErrorMessageCode; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp new file mode 100644 index 0000000000000..5cfa609018b29 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -0,0 +1,59 @@ +//===--------- common.hpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
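The helpers in common.cpp above form the adapter's error-handling backbone: every CUDA driver call is checked, a failing CUresult is translated into the closest ur_result_t, and the mapped code is thrown so that the entry point can catch it and hand it back to the caller. A minimal standalone sketch of that pattern, using stand-in enums and a hypothetical CHECK macro rather than the real CUresult / ur_result_t / UR_CHECK_ERROR names:

    #include <iostream>

    // Stand-in enums; the real adapter maps CUresult values to ur_result_t codes.
    enum class FakeCuResult { Success, InvalidValue, OutOfMemory };
    enum class FakeUrResult { Success, ErrorInvalidValue, ErrorOutOfHostMemory, ErrorUnknown };

    // Translate a driver error into the closest runtime error (in the spirit of mapErrorUR).
    FakeUrResult mapError(FakeCuResult R) {
      switch (R) {
      case FakeCuResult::Success:      return FakeUrResult::Success;
      case FakeCuResult::InvalidValue: return FakeUrResult::ErrorInvalidValue;
      case FakeCuResult::OutOfMemory:  return FakeUrResult::ErrorOutOfHostMemory;
      }
      return FakeUrResult::ErrorUnknown;
    }

    // Log the failing call site and throw the mapped error (in the spirit of checkErrorUR).
    FakeUrResult checkError(FakeCuResult R, const char *Function, int Line) {
      if (R == FakeCuResult::Success)
        return FakeUrResult::Success;
      std::cerr << "error in " << Function << " at line " << Line << "\n";
      throw mapError(R);
    }

    // Call-site wrapper, analogous to UR_CHECK_ERROR.
    #define CHECK(Call) checkError((Call), __func__, __LINE__)

    int main() {
      try {
        CHECK(FakeCuResult::Success);     // passes through silently
        CHECK(FakeCuResult::OutOfMemory); // logs and throws the mapped code
      } catch (FakeUrResult Err) {
        // Adapter entry points catch the thrown code and return it to the caller.
        return Err == FakeUrResult::ErrorOutOfHostMemory ? 1 : 2;
      }
      return 0;
    }

Because errors propagate as thrown ur_result_t values, the outermost try/catch in each UR entry point is what turns a deep driver failure into an ordinary return code.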
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +ur_result_t mapErrorUR(CUresult Result); + +/// Converts CUDA error into UR error codes, and outputs error information +/// to stderr. +/// If PI_CUDA_ABORT env variable is defined, it aborts directly instead of +/// throwing the error. This is intended for debugging purposes. +/// \return UR_RESULT_SUCCESS if \param Result was CUDA_SUCCESS. +/// \throw ur_result_t exception (integer) if input was not success. +/// +ur_result_t checkErrorUR(CUresult Result, const char *Function, int Line, + const char *File); + +#define UR_CHECK_ERROR(Result) \ + checkErrorUR(Result, __func__, __LINE__, __FILE__) + +std::string getCudaVersionString(); + +constexpr size_t MaxMessageSize = 256; +extern thread_local ur_result_t ErrorMessageCode; +extern thread_local char ErrorMessage[MaxMessageSize]; + +// Utility function for setting a message and warning +[[maybe_unused]] void setErrorMessage(const char *pMessage, + ur_result_t ErrorCode); + +/// ------ Error handling, matching OpenCL plugin semantics. +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { +namespace ur { + +// Report error and no return (keeps compiler from printing warnings). +// TODO: Probably change that to throw a catchable exception, +// but for now it is useful to see every failure. +// +[[noreturn]] void die(const char *Message); + +// Reports error messages +void cuPrint(const char *Message); + +void assertion(bool Condition, const char *Message = nullptr); + +} // namespace ur +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp new file mode 100644 index 0000000000000..74a32bdac2748 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.cpp @@ -0,0 +1,154 @@ +//===--------- context.cpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "context.hpp" + +#include + +/// Create a UR CUDA context. +/// +/// By default creates a scoped context and keeps the last active CUDA context +/// on top of the CUDA context stack. +/// With the __SYCL_PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of +/// PI_TRUE creates a primary CUDA context and activates it on the CUDA context +/// stack. +/// +UR_APIEXPORT ur_result_t UR_APICALL +urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, + const ur_context_properties_t *pProperties, + ur_context_handle_t *phContext) { + std::ignore = DeviceCount; + std::ignore = pProperties; + UR_ASSERT(phDevices, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + assert(DeviceCount == 1); + ur_result_t RetErr = UR_RESULT_SUCCESS; + + std::unique_ptr ContextPtr{nullptr}; + try { + ContextPtr = std::unique_ptr( + new ur_context_handle_t_{*phDevices}); + *phContext = ContextPtr.release(); + } catch (ur_result_t Err) { + RetErr = Err; + } catch (...) 
{ + RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + return RetErr; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( + ur_context_handle_t hContext, ur_context_info_t ContextInfoType, + size_t propSize, void *pContextInfo, size_t *pPropSizeRet) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pContextInfo, pPropSizeRet); + + switch (uint32_t{ContextInfoType}) { + case UR_CONTEXT_INFO_NUM_DEVICES: + return ReturnValue(1); + case UR_CONTEXT_INFO_DEVICES: + return ReturnValue(hContext->getDevice()); + case UR_CONTEXT_INFO_REFERENCE_COUNT: + return ReturnValue(hContext->getReferenceCount()); + case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + uint32_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(Capabilities); + } + case UR_CONTEXT_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int Major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hContext->getDevice()->get()) == CUDA_SUCCESS); + uint32_t Capabilities = + (Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM + : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; + return ReturnValue(Capabilities); + } + case UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT: + // 2D USM memcpy is supported. + return ReturnValue(true); + case UR_CONTEXT_INFO_USM_FILL2D_SUPPORT: + // 2D USM operations currently not supported. 
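urContextGetInfo above relies on UrReturnHelper (presumably provided by the shared ur/ur.hpp helpers) to implement the usual UR query protocol: the caller may first pass a null buffer to learn the required size, then call again with a buffer of at least that size. A minimal sketch of that two-step protocol, with returnValue as a hypothetical stand-in for what the helper does:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Stand-in for the (propSize, pPropValue, pPropSizeRet) contract wrapped by
    // UrReturnHelper: report the needed size, and copy the value if a buffer is given.
    template <typename T>
    int returnValue(const T &Value, size_t PropSize, void *PropValue, size_t *PropSizeRet) {
      if (PropSizeRet)
        *PropSizeRet = sizeof(T);   // how many bytes the caller must provide
      if (PropValue) {
        if (PropSize < sizeof(T))
          return -1;                // caller's buffer is too small
        std::memcpy(PropValue, &Value, sizeof(T));
      }
      return 0;
    }

    int main() {
      // First call: query only the required size.
      size_t Needed = 0;
      returnValue<uint32_t>(1u, 0, nullptr, &Needed);
      assert(Needed == sizeof(uint32_t));

      // Second call: retrieve the value into a correctly sized buffer.
      uint32_t NumDevices = 0;
      returnValue<uint32_t>(1u, Needed, &NumDevices, nullptr);
      assert(NumDevices == 1u);
      return 0;
    }

This is why each case in the switch can simply "return ReturnValue(...)": the helper deals with both the size-query and the value-copy halves of the protocol.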
+ return ReturnValue(false); + + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urContextRelease(ur_context_handle_t hContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + if (hContext->decrementReferenceCount() > 0) { + return UR_RESULT_SUCCESS; + } + hContext->invokeExtendedDeleters(); + + std::unique_ptr Context{hContext}; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urContextRetain(ur_context_handle_t hContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + assert(hContext->getReferenceCount() > 0); + + hContext->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( + ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeContext, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *phNativeContext = reinterpret_cast(hContext->get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextCreateWithNativeHandle( + ur_native_handle_t hNativeContext, uint32_t numDevices, + const ur_device_handle_t *phDevices, + const ur_context_native_properties_t *pProperties, + ur_context_handle_t *phContext) { + std::ignore = hNativeContext; + std::ignore = numDevices; + std::ignore = phDevices; + std::ignore = pProperties; + std::ignore = phContext; + + return UR_RESULT_ERROR_INVALID_OPERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urContextSetExtendedDeleter( + ur_context_handle_t hContext, ur_context_extended_deleter_t pfnDeleter, + void *pUserData) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pfnDeleter, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + hContext->setExtendedDeleter(pfnDeleter, pUserData); + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp new file mode 100644 index 0000000000000..e13c48fa003b9 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/context.hpp @@ -0,0 +1,139 @@ +//===--------- context.hpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include +#include + +#include "common.hpp" +#include "device.hpp" + +typedef void (*ur_context_extended_deleter_t)(void *user_data); + +/// UR context mapping to a CUDA context object. +/// +/// There is no direct mapping between a CUDA context and a UR context. +/// The main differences are described below: +/// +/// CUDA context vs UR context +/// +/// One of the main differences between the UR API and the CUDA driver API is +/// that the second modifies the state of the threads by assigning +/// `CUcontext` objects to threads. `CUcontext` objects store data associated +/// with a given device and control access to said device from the user side. +/// UR API context are objects that are passed to functions, and not bound +/// to threads. +/// The ur_context_handle_t_ object doesn't implement this behavior. It only +/// holds the CUDA context data. The RAII object \ref ScopedContext implements +/// the active context behavior. 
+/// +/// Primary vs User-defined context +/// +/// CUDA has two different types of context, the Primary context, +/// which is usable by all threads on a given process for a given device, and +/// the aforementioned custom contexts. +/// The CUDA documentation, confirmed with performance analysis, suggest using +/// the Primary context whenever possible. +/// The Primary context is also used by the CUDA Runtime API. +/// For UR applications to interop with CUDA Runtime API, they have to use +/// the primary context - and make that active in the thread. +/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter +/// that allows to construct a Primary or `user-defined` context, so that +/// the UR object interface is always the same. +/// +/// Destructor callback +/// +/// Required to implement CP023, SYCL Extended Context Destruction, +/// the PI Context can store a number of callback functions that will be +/// called upon destruction of the UR Context. +/// See proposal for details. +/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md +/// +struct ur_context_handle_t_ { + + struct deleter_data { + ur_context_extended_deleter_t Function; + void *UserData; + + void operator()() { Function(UserData); } + }; + + using native_type = CUcontext; + + native_type CUContext; + ur_device_handle_t DeviceID; + std::atomic_uint32_t RefCount; + + ur_context_handle_t_(ur_device_handle_t_ *DevID) + : CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} { + urDeviceRetain(DeviceID); + }; + + ~ur_context_handle_t_() { urDeviceRelease(DeviceID); } + + void invokeExtendedDeleters() { + std::lock_guard Guard(Mutex); + for (auto &Deleter : ExtendedDeleters) { + Deleter(); + } + } + + void setExtendedDeleter(ur_context_extended_deleter_t Function, + void *UserData) { + std::lock_guard Guard(Mutex); + ExtendedDeleters.emplace_back(deleter_data{Function, UserData}); + } + + ur_device_handle_t getDevice() const noexcept { return DeviceID; } + + native_type get() const noexcept { return CUContext; } + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } + +private: + std::mutex Mutex; + std::vector ExtendedDeleters; +}; + +namespace { +class ScopedContext { +public: + ScopedContext(ur_context_handle_t Context) { + if (!Context) { + throw UR_RESULT_ERROR_INVALID_CONTEXT; + } + + setContext(Context->get()); + } + + ScopedContext(CUcontext NativeContext) { setContext(NativeContext); } + + ~ScopedContext() {} + +private: + void setContext(CUcontext Desired) { + CUcontext Original = nullptr; + + UR_CHECK_ERROR(cuCtxGetCurrent(&Original)); + + // Make sure the desired context is active on the current thread, setting + // it if necessary + if (Original != Desired) { + UR_CHECK_ERROR(cuCtxSetCurrent(Desired)); + } + } +}; +} // namespace diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp new file mode 100644 index 0000000000000..c364c6f384a49 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.cpp @@ -0,0 +1,1201 @@ +//===--------- device.cpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
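The ScopedContext class defined above is the RAII piece the context documentation refers to: constructing one makes the handle's CUcontext current on the calling thread, and nothing is restored on destruction, so the last activated context stays current. A small sketch of the same idea, with an int standing in for CUcontext and a thread-local variable standing in for cuCtxGetCurrent / cuCtxSetCurrent:

    #include <cassert>

    // Stand-in for the thread's current CUcontext; the adapter queries and sets
    // it through cuCtxGetCurrent / cuCtxSetCurrent instead of a plain variable.
    thread_local int CurrentCtx = 0;

    // RAII helper in the spirit of ScopedContext: make the requested context
    // current for the duration of an adapter entry point.
    class ScopedCtx {
    public:
      explicit ScopedCtx(int Desired) {
        if (CurrentCtx != Desired)
          CurrentCtx = Desired; // cuCtxSetCurrent(Desired) in the real adapter
      }
      // As in the patch, nothing is restored on destruction: the last activated
      // context simply stays current on the thread.
      ~ScopedCtx() = default;
    };

    int main() {
      {
        ScopedCtx Active(42); // e.g. the CUcontext held by a ur_context handle
        assert(CurrentCtx == 42);
        // ... driver calls issued here run against context 42 ...
      }
      assert(CurrentCtx == 42); // still current after the scope ends, by design
      return 0;
    }

Leaving the last context active avoids an extra cuCtxSetCurrent on every entry point when consecutive calls target the same context.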
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include +#include + +#include "context.hpp" +#include "device.hpp" +#include "platform.hpp" + +int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { + int value; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS); + return value; +} + +uint64_t ur_device_handle_t_::getElapsedTime(CUevent ev) const { + float Milliseconds = 0.0f; + + UR_CHECK_ERROR(cuEventElapsedTime(&Milliseconds, EvBase, ev)); + + return static_cast(Milliseconds * 1.0e6); +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + ur_device_info_t propName, + size_t propSize, + void *pPropValue, + size_t *pPropSizeRet) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + static constexpr uint32_t MaxWorkItemDimensions = 3u; + + ScopedContext Active(hDevice->getContext()); + + switch ((uint32_t)propName) { + case UR_DEVICE_INFO_TYPE: { + return ReturnValue(UR_DEVICE_TYPE_GPU); + } + case UR_DEVICE_INFO_VENDOR_ID: { + return ReturnValue(4318u); + } + case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { + int ComputeUnits = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&ComputeUnits, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(ComputeUnits >= 0); + return ReturnValue(static_cast(ComputeUnits)); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: { + return ReturnValue(MaxWorkItemDimensions); + } + case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { + struct { + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; + + int MaxX = 0, MaxY = 0, MaxZ = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxX >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxY >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxZ >= 0); + + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); + ReturnSizes.Sizes[2] = size_t(MaxZ); + return ReturnValue(ReturnSizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUPS_3D: { + struct { + size_t Sizes[MaxWorkItemDimensions]; + } ReturnSizes; + int MaxX = 0, MaxY = 0, MaxZ = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxX >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxY >= 0); + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(MaxZ >= 0); + + ReturnSizes.Sizes[0] = size_t(MaxX); + ReturnSizes.Sizes[1] = size_t(MaxY); + ReturnSizes.Sizes[2] = size_t(MaxZ); + return ReturnValue(ReturnSizes); + } + + case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: { + int MaxWorkGroupSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxWorkGroupSize, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + 
hDevice->get()) == CUDA_SUCCESS); + + sycl::detail::ur::assertion(MaxWorkGroupSize >= 0); + + return ReturnValue(size_t(MaxWorkGroupSize)); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE: { + return ReturnValue(1u); + } + case UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int MaxThreads = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxThreads, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hDevice->get()) == CUDA_SUCCESS); + int WarpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(MaxWarps); + } + case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { + // Volta provides independent thread scheduling + // TODO: Revisit for previous generation GPUs + int Major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); + bool IFP = (Major >= 7); + return ReturnValue(IFP); + } + + case UR_DEVICE_INFO_ATOMIC_64: { + int Major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); + + bool Atomic64 = (Major >= 6) ? true : false; + return ReturnValue(Atomic64); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { + uint64_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(Capabilities); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { + int Major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); + uint64_t Capabilities = + (Major >= 7) ? 
UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM + : UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE; + return ReturnValue(Capabilities); + } + + case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence_order_capabilities. + ur_memory_order_capability_flags_t Capabilities = + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | + UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + return ReturnValue(Capabilities); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + // SYCL2020 4.6.4.2 minimum mandated capabilities for + // atomic_fence/memory_scope_capabilities. + // Because scopes are hierarchical, wider scopes support all narrower + // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and + // WORK_GROUP. (https://github.com/KhronosGroup/SYCL-Docs/pull/382) + ur_memory_scope_capability_flags_t Capabilities = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; + return ReturnValue(Capabilities); + } + case UR_DEVICE_INFO_BFLOAT16: { + int Major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); + + bool BFloat16 = (Major >= 8) ? true : false; + return ReturnValue(BFloat16); + } + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // NVIDIA devices only support one sub-group size (the warp size) + int WarpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + size_t Sizes[1] = {static_cast(WarpSize)}; + return ReturnValue(Sizes, 1); + } + case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { + int ClockFreq = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&ClockFreq, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(ClockFreq >= 0); + return ReturnValue(static_cast(ClockFreq) / 1000u); + } + case UR_DEVICE_INFO_ADDRESS_BITS: { + auto Bits = uint32_t{std::numeric_limits::digits}; + return ReturnValue(Bits); + } + case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: { + // Max size of memory object allocation in bytes. + // The minimum value is max(min(1024 × 1024 × + // 1024, 1/4th of CL_DEVICE_GLOBAL_MEM_SIZE), + // 32 × 1024 × 1024) for devices that are not of type + // CL_DEVICE_TYPE_CUSTOM. + + size_t Global = 0; + sycl::detail::ur::assertion(cuDeviceTotalMem(&Global, hDevice->get()) == + CUDA_SUCCESS); + + auto QuarterGlobal = static_cast(Global / 4u); + + auto MaxAlloc = std::max(std::min(1024u * 1024u * 1024u, QuarterGlobal), + 32u * 1024u * 1024u); + + return ReturnValue(uint64_t{MaxAlloc}); + } + case UR_DEVICE_INFO_IMAGE_SUPPORTED: { + bool Enabled = false; + + if (std::getenv("SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT") != nullptr) { + Enabled = true; + } else { + sycl::detail::ur::cuPrint( + "Images are not fully supported by the CUDA BE, their support is " + "disabled by default. 
Their partial support can be activated by " + "setting SYCL_PI_CUDA_ENABLE_IMAGE_SUPPORT environment variable at " + "runtime."); + } + + return ReturnValue(uint32_t{Enabled}); + } + case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: { + // This call doesn't match to CUDA as it doesn't have images, but instead + // surfaces and textures. No clear call in the CUDA API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: { + // This call doesn't match to CUDA as it doesn't have images, but instead + // surfaces and textures. No clear call in the CUDA API to determine this, + // but some searching found as of SM 2.x 128 are supported. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int TexHeight = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&TexHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&SurfHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfHeight >= 0); + + int Min = std::min(TexHeight, SurfHeight); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&TexWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&SurfWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT: { + // Take the smaller of maximum surface and maximum texture height. + int TexHeight = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&TexHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexHeight >= 0); + int SurfHeight = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&SurfHeight, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfHeight >= 0); + + int Min = std::min(TexHeight, SurfHeight); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&TexWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&SurfWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH: { + // Take the smaller of maximum surface and maximum texture depth. 
+ int TexDepth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&TexDepth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexDepth >= 0); + int SurfDepth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&SurfDepth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfDepth >= 0); + + int Min = std::min(TexDepth, SurfDepth); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE: { + // Take the smaller of maximum surface and maximum texture width. + int TexWidth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&TexWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(TexWidth >= 0); + int SurfWidth = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&SurfWidth, + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(SurfWidth >= 0); + + int Min = std::min(TexWidth, SurfWidth); + + return ReturnValue(static_cast(Min)); + } + case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { + return ReturnValue(0lu); + } + case UR_DEVICE_INFO_MAX_SAMPLERS: { + // This call is kind of meaningless for cuda, as samplers don't exist. + // Closest thing is textures, which is 128. + return ReturnValue(128u); + } + case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: { + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters + // __global__ function parameters are passed to the device via constant + // memory and are limited to 4 KB. + return ReturnValue(4000lu); + } + case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { + int MemBaseAddrAlign = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MemBaseAddrAlign, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, + hDevice->get()) == CUDA_SUCCESS); + // Multiply by 8 as clGetDeviceInfo returns this value in bits + MemBaseAddrAlign *= 8; + return ReturnValue(MemBaseAddrAlign); + } + case UR_DEVICE_INFO_HALF_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + return ReturnValue(0u); + } + case UR_DEVICE_INFO_SINGLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + uint64_t Config = + UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA | + UR_DEVICE_FP_CAPABILITY_FLAG_CORRECTLY_ROUNDED_DIVIDE_SQRT; + return ReturnValue(Config); + } + case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: { + // TODO: is this config consistent across all NVIDIA GPUs? + uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM | + UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO | + UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF | + UR_DEVICE_FP_CAPABILITY_FLAG_FMA; + return ReturnValue(Config); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: { + // TODO: is this config consistent across all NVIDIA GPUs? + return ReturnValue(UR_DEVICE_MEM_CACHE_TYPE_READ_WRITE_CACHE); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE: { + // The value is documented for all existing GPUs in the CUDA programming + // guidelines, section "H.3.2. Global Memory". 
+ return ReturnValue(128u); + } + case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE: { + int CacheSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(CacheSize >= 0); + // The L2 cache is global to the GPU. + return ReturnValue(static_cast(CacheSize)); + } + case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { + size_t Bytes = 0; + // Runtime API has easy access to this value, driver API info is scarse. + sycl::detail::ur::assertion(cuDeviceTotalMem(&Bytes, hDevice->get()) == + CUDA_SUCCESS); + return ReturnValue(uint64_t{Bytes}); + } + case UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE: { + int ConstantMemory = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&ConstantMemory, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(ConstantMemory >= 0); + + return ReturnValue(static_cast(ConstantMemory)); + } + case UR_DEVICE_INFO_MAX_CONSTANT_ARGS: { + // TODO: is there a way to retrieve this from CUDA driver API? + // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX + // 1060 3GB + return ReturnValue(9u); + } + case UR_DEVICE_INFO_LOCAL_MEM_TYPE: { + return ReturnValue(UR_DEVICE_LOCAL_MEM_TYPE_LOCAL); + } + case UR_DEVICE_INFO_LOCAL_MEM_SIZE: { + // OpenCL's "local memory" maps most closely to CUDA's "shared memory". + // CUDA has its own definition of "local memory", which maps to OpenCL's + // "private memory". + int LocalMemSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&LocalMemSize, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(LocalMemSize >= 0); + return ReturnValue(static_cast(LocalMemSize)); + } + case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: { + int ECCEnabled = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&ECCEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, + hDevice->get()) == CUDA_SUCCESS); + + sycl::detail::ur::assertion((ECCEnabled == 0) | (ECCEnabled == 1)); + auto Result = static_cast(ECCEnabled); + return ReturnValue(Result); + } + case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: { + int IsIntegrated = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&IsIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, + hDevice->get()) == CUDA_SUCCESS); + + sycl::detail::ur::assertion((IsIntegrated == 0) | (IsIntegrated == 1)); + auto result = static_cast(IsIntegrated); + return ReturnValue(result); + } + case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { + // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX + // 1060 3GB + return ReturnValue(1000lu); + } + case UR_DEVICE_INFO_ENDIAN_LITTLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_COMPILER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_LINKER_AVAILABLE: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_EXECUTION_CAPABILITIES: { + auto Capability = ur_device_exec_capability_flags_t{ + UR_DEVICE_EXEC_CAPABILITY_FLAG_KERNEL}; + return ReturnValue(Capability); + } + case UR_DEVICE_INFO_QUEUE_PROPERTIES: + return ReturnValue( + ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | + UR_QUEUE_FLAG_PROFILING_ENABLE)); + case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { + // The mandated minimum capability: + uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE | + 
UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + return ReturnValue(Capability); + } + case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { + // The mandated minimum capability: + uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE; + return ReturnValue(Capability); + } + case UR_DEVICE_INFO_BUILT_IN_KERNELS: { + // An empty string is returned if no built-in kernels are supported by the + // device. + return ReturnValue(""); + } + case UR_DEVICE_INFO_PLATFORM: { + return ReturnValue(hDevice->getPlatform()); + } + case UR_DEVICE_INFO_NAME: { + static constexpr size_t MaxDeviceNameLength = 256u; + char Name[MaxDeviceNameLength]; + sycl::detail::ur::assertion( + cuDeviceGetName(Name, MaxDeviceNameLength, hDevice->get()) == + CUDA_SUCCESS); + return ReturnValue(Name, strlen(Name) + 1); + } + case UR_DEVICE_INFO_VENDOR: { + return ReturnValue("NVIDIA Corporation"); + } + case UR_DEVICE_INFO_DRIVER_VERSION: { + auto Version = getCudaVersionString(); + return ReturnValue(Version.c_str()); + } + case UR_DEVICE_INFO_PROFILE: { + return ReturnValue("CUDA"); + } + case UR_DEVICE_INFO_REFERENCE_COUNT: { + return ReturnValue(hDevice->getReferenceCount()); + } + case UR_DEVICE_INFO_VERSION: { + std::stringstream SS; + int Major; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); + SS << Major; + int Minor; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + hDevice->get()) == CUDA_SUCCESS); + SS << "." << Minor; + return ReturnValue(SS.str().c_str()); + } + case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: { + return ReturnValue(""); + } + case UR_DEVICE_INFO_EXTENSIONS: { + + std::string SupportedExtensions = "cl_khr_fp64 cl_khr_subgroups "; + SupportedExtensions += "pi_ext_intel_devicelib_assert "; + SupportedExtensions += " "; + + int Major = 0; + int Minor = 0; + + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + hDevice->get()) == CUDA_SUCCESS); + + if ((Major >= 6) || ((Major == 5) && (Minor >= 3))) { + SupportedExtensions += "cl_khr_fp16 "; + } + + return ReturnValue(SupportedExtensions.c_str()); + } + case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: { + // The minimum value for the FULL profile is 1 MB. + return ReturnValue(1024lu); + } + case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { + return ReturnValue(true); + } + case UR_DEVICE_INFO_PARENT_DEVICE: { + return ReturnValue(nullptr); + } + case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_PARTITION_PROPERTIES: { + return ReturnValue(static_cast(0u)); + } + case UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN: { + return ReturnValue(0u); + } + case UR_DEVICE_INFO_PARTITION_TYPE: { + return ReturnValue(static_cast(0u)); + } + + // Intel USM extensions + + case UR_DEVICE_INFO_USM_HOST_SUPPORT: { + // from cl_intel_unified_shared_memory: "The host memory access capabilities + // apply to any host allocation." 
+ // + // query if/how the device can access page-locked host memory, possibly + // through PCIe, using the same pointer as the host + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) { + // the device shares a unified address space with the host + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } else { + // on GPU architectures with compute capability lower than 6.x, atomic + // operations from the GPU to CPU memory will not be atomic with respect + // to CPU initiated atomic operations + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + } + } + return ReturnValue(Value); + } + case UR_DEVICE_INFO_USM_DEVICE_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The device memory access capabilities apply to any device allocation + // associated with this device." + // + // query how the device can access memory allocated on the device itself (?) + uint32_t Value = + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + return ReturnValue(Value); + } + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The single device shared memory access capabilities apply to any shared + // allocation associated with this device." + // + // query if/how the device can access managed memory associated to it + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { + // the device can allocate managed memory on this system + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + } + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + // the device can coherently access managed memory concurrently with the + // CPU + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } + } + return ReturnValue(Value); + } + case UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The cross-device shared memory access capabilities apply to any shared + // allocation associated with this device, or to any shared memory + // allocation on another device that also supports the same cross-device + // shared memory access capability." 
+ // + // query if/how the device can access managed memory associated to other + // devices + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) { + // the device can allocate managed memory on this system + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS; + } + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + // attribute can coherently access managed memory concurrently with the + // CPU + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + } + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 6) { + // compute capability 6.x introduces operations that are atomic with + // respect to other CPUs and GPUs in the system + if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS) + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS; + if (Value & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS) + Value |= UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } + return ReturnValue(Value); + } + case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: { + // from cl_intel_unified_shared_memory: + // "The shared system memory access capabilities apply to any allocations + // made by a system allocator, such as malloc or new." + // + // query if/how the device can access pageable host memory allocated by the + // system allocator + uint32_t Value = {}; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) { + // the device suppports coherently accessing pageable memory without + // calling cuMemHostRegister/cudaHostRegister on it + if (getAttribute(hDevice, + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) { + // the link between the device and the host supports native atomic + // operations + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + } else { + // the link between the device and the host does not support native + // atomic operations + Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; + } + } + return ReturnValue(Value); + } + case UR_DEVICE_INFO_ASYNC_BARRIER: { + int Value = getAttribute(hDevice, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 8; + return ReturnValue(static_cast(Value)); + } + case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: { + int Major = + getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR); + int Minor = + getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR); + std::string Result = std::to_string(Major) + "." + std::to_string(Minor); + return ReturnValue(Result.c_str()); + } + + case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { + size_t FreeMemory = 0; + size_t TotalMemory = 0; + sycl::detail::ur::assertion(cuMemGetInfo(&FreeMemory, &TotalMemory) == + CUDA_SUCCESS, + "failed cuMemGetInfo() API."); + return ReturnValue(FreeMemory); + } + case UR_DEVICE_INFO_MEMORY_CLOCK_RATE: { + int Value = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(Value >= 0); + // Convert kilohertz to megahertz when returning. 
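The shared-USM queries above all follow the same shape: probe a couple of CUDA device attributes and fold them into a bitmask of UR access-capability flags, with compute capability 6.x unlocking the atomic-concurrent bits. A compact sketch of that composition, using hypothetical flag constants and a hypothetical sharedUsmCaps helper in place of the UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_* values:

    #include <cassert>
    #include <cstdint>

    // Illustrative bit flags standing in for the UR access-capability values.
    constexpr uint32_t AccessFlag           = 1u << 0;
    constexpr uint32_t AtomicAccessFlag     = 1u << 1;
    constexpr uint32_t ConcurrentAccessFlag = 1u << 2;
    constexpr uint32_t AtomicConcurrentFlag = 1u << 3;

    // Compose a capability mask from device attributes: managed memory enables
    // basic access, concurrent managed access adds the concurrent bit, and
    // compute capability 6.x adds system-wide atomicity on top of that.
    uint32_t sharedUsmCaps(bool ManagedMemory, bool ConcurrentManagedAccess,
                           int ComputeCapabilityMajor) {
      uint32_t Caps = 0;
      if (ManagedMemory)
        Caps |= AccessFlag | AtomicAccessFlag;
      if (ConcurrentManagedAccess) {
        Caps |= ConcurrentAccessFlag;
        if (ComputeCapabilityMajor >= 6)
          Caps |= AtomicConcurrentFlag;
      }
      return Caps;
    }

    int main() {
      // A Pascal-or-newer device with full managed-memory support reports every bit.
      assert(sharedUsmCaps(true, true, 6) ==
             (AccessFlag | AtomicAccessFlag | ConcurrentAccessFlag | AtomicConcurrentFlag));
      // Managed memory without concurrent access only reports the first two bits.
      assert(sharedUsmCaps(true, false, 5) == (AccessFlag | AtomicAccessFlag));
      return 0;
    }

The single-shared, cross-shared and system-shared cases differ only in which attributes gate which bits, not in this overall pattern.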
+ return ReturnValue(Value / 1000); + } + case UR_DEVICE_INFO_MEMORY_BUS_WIDTH: { + int Value = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Value, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(Value >= 0); + return ReturnValue(Value); + } + case UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES: { + return ReturnValue(int32_t{1}); + } + case UR_DEVICE_INFO_DEVICE_ID: { + int Value = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion(Value >= 0); + return ReturnValue(Value); + } + case UR_DEVICE_INFO_UUID: { + int DriverVersion = 0; + cuDriverGetVersion(&DriverVersion); + int Major = DriverVersion / 1000; + int Minor = DriverVersion % 1000 / 10; + CUuuid UUID; + if ((Major > 11) || (Major == 11 && Minor >= 4)) { + sycl::detail::ur::assertion(cuDeviceGetUuid_v2(&UUID, hDevice->get()) == + CUDA_SUCCESS); + } else { + sycl::detail::ur::assertion(cuDeviceGetUuid(&UUID, hDevice->get()) == + CUDA_SUCCESS); + } + std::array Name; + std::copy(UUID.bytes, UUID.bytes + 16, Name.begin()); + return ReturnValue(Name.data(), 16); + } + case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: { + int Major = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + hDevice->get()) == CUDA_SUCCESS); + + int Minor = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&Minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + hDevice->get()) == CUDA_SUCCESS); + + // Some specific devices seem to need special handling. See reference + // https://github.com/jeffhammond/HPCInfo/blob/master/cuda/gpu-detect.cu + bool IsXavierAGX = Major == 7 && Minor == 2; + bool IsOrinAGX = Major == 8 && Minor == 7; + + int MemoryClockKHz = 0; + if (IsXavierAGX) { + MemoryClockKHz = 2133000; + } else if (IsOrinAGX) { + MemoryClockKHz = 3200000; + } else { + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MemoryClockKHz, + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, + hDevice->get()) == CUDA_SUCCESS); + } + + int MemoryBusWidth = 0; + if (IsOrinAGX) { + MemoryBusWidth = 256; + } else { + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MemoryBusWidth, + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, + hDevice->get()) == CUDA_SUCCESS); + } + + uint64_t MemoryBandwidth = uint64_t(MemoryClockKHz) * MemoryBusWidth * 250; + + return ReturnValue(MemoryBandwidth); + } + case UR_DEVICE_INFO_IL_VERSION: { + std::string ILVersion = "nvptx-"; + + int DriverVersion = 0; + cuDriverGetVersion(&DriverVersion); + int Major = DriverVersion / 1000; + int Minor = DriverVersion % 1000 / 10; + + // We can work out which ptx ISA version we support based on the versioning + // table published here + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release-notes + // Major versions that we support are consistent in how they line up, so we + // can derive that easily. The minor versions for version 10 don't line up + // the same so it needs a special case. This is not ideal but it does seem + // to be the best bet to avoid a maintenance burden here. 
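The IL-version mapping described in the comment above can be checked against NVIDIA's published table: for driver major versions 11 and later the PTX ISA major is the CUDA major minus four with the minor carried over, while the 10.x series needs its minor shifted by three. A small sketch of that derivation (ptxIsaFromDriver is a hypothetical helper; the driver version is encoded as 1000 * major + 10 * minor, as returned by cuDriverGetVersion):

    #include <cassert>
    #include <string>

    // Derive the reported PTX ISA string from an encoded driver version,
    // following the same mapping as the UR_DEVICE_INFO_IL_VERSION case:
    //   major >= 11 -> ptx major = cuda major - 4, minor carried over
    //   major == 10 -> ptx major = 6, ptx minor = cuda minor + 3
    std::string ptxIsaFromDriver(int DriverVersion) {
      int Major = DriverVersion / 1000;
      int Minor = DriverVersion % 1000 / 10;
      std::string IL = "nvptx-" + std::to_string(Major - 4) + ".";
      if (Major == 10)
        IL += std::to_string(Minor + 3);
      else
        IL += std::to_string(Minor);
      return IL; // drivers older than 10.x are rejected in the adapter
    }

    int main() {
      assert(ptxIsaFromDriver(12010) == "nvptx-8.1"); // CUDA 12.1 -> PTX ISA 8.1
      assert(ptxIsaFromDriver(11080) == "nvptx-7.8"); // CUDA 11.8 -> PTX ISA 7.8
      assert(ptxIsaFromDriver(10020) == "nvptx-6.5"); // CUDA 10.2 -> PTX ISA 6.5
      return 0;
    }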
+ ILVersion += std::to_string(Major - 4) + "."; + if (Major == 10) { + ILVersion += std::to_string(Minor + 3); + } else if (Major >= 11) { + ILVersion += std::to_string(Minor); + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return ReturnValue(ILVersion.data(), ILVersion.size()); + } + case UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP: { + // Maximum number of 32-bit registers available to a thread block. + // Note: This number is shared by all thread blocks simultaneously resident + // on a multiprocessor. + int MaxRegisters{-1}; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxRegisters, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + hDevice->get())); + + sycl::detail::ur::assertion(MaxRegisters >= 0); + + return ReturnValue(static_cast(MaxRegisters)); + } + case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: + return ReturnValue(false); + case UR_DEVICE_INFO_IMAGE_SRGB: + return ReturnValue(false); + case UR_DEVICE_INFO_PCI_ADDRESS: { + constexpr size_t AddressBufferSize = 13; + char AddressBuffer[AddressBufferSize]; + sycl::detail::ur::assertion( + cuDeviceGetPCIBusId(AddressBuffer, AddressBufferSize, hDevice->get()) == + CUDA_SUCCESS); + // CUDA API (8.x - 12.1) guarantees 12 bytes + \0 are written + sycl::detail::ur::assertion(strnlen(AddressBuffer, AddressBufferSize) == + 12); + return ReturnValue(AddressBuffer, + strnlen(AddressBuffer, AddressBufferSize - 1) + 1); + } + case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS: + return ReturnValue(false); + // TODO: Investigate if this information is available on CUDA. + case UR_DEVICE_INFO_GPU_EU_COUNT: + case UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH: + case UR_DEVICE_INFO_GPU_EU_SLICES: + case UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE: + case UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE: + case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +/// \return PI_SUCCESS if the function is executed successfully +/// CUDA devices are always root devices so retain always returns success. +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urDevicePartition(ur_device_handle_t, const ur_device_partition_property_t *, + uint32_t, ur_device_handle_t *, uint32_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// \return UR_RESULT_SUCCESS always since CUDA devices are always root +/// devices. +UR_APIEXPORT ur_result_t UR_APICALL +urDeviceRelease(ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, + ur_device_type_t DeviceType, + uint32_t NumEntries, + ur_device_handle_t *phDevices, + uint32_t *pNumDevices) { + ur_result_t Result = UR_RESULT_SUCCESS; + const bool AskingForAll = DeviceType == UR_DEVICE_TYPE_ALL; + const bool AskingForDefault = DeviceType == UR_DEVICE_TYPE_DEFAULT; + const bool AskingForGPU = DeviceType == UR_DEVICE_TYPE_GPU; + const bool ReturnDevices = AskingForDefault || AskingForAll || AskingForGPU; + + UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t NumDevices = ReturnDevices ? 
hPlatform->Devices.size() : 0; + + try { + UR_ASSERT(pNumDevices || phDevices, UR_RESULT_ERROR_INVALID_VALUE); + + if (pNumDevices) { + *pNumDevices = NumDevices; + } + + if (ReturnDevices && phDevices) { + for (size_t i = 0; i < std::min(size_t(NumEntries), NumDevices); ++i) { + phDevices[i] = hPlatform->Devices[i].get(); + } + } + + return Result; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +/// Gets the native CUDA handle of a UR device object +/// +/// \param[in] device The UR device to get the native CUDA object of. +/// \param[out] nativeHandle Set to the native handle of the UR device object. +/// +/// \return PI_SUCCESS + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( + ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeHandle, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *phNativeHandle = reinterpret_cast(hDevice->get()); + return UR_RESULT_SUCCESS; +} + +/// Created a UR device object from a CUDA device handle. +/// NOTE: The created UR object does not take ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create UR device object from. +/// \param[in] platform is the UR platform of the device. +/// \param[out] device Set to the UR device object created from native handle. +/// +/// \return TBD + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( + ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + const ur_device_native_properties_t *pProperties, + ur_device_handle_t *phDevice) { + std::ignore = pProperties; + UR_ASSERT(phDevice, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + // We can't cast between ur_native_handle_t and CUdevice, so memcpy the bits + // instead + CUdevice CuDevice = 0; + memcpy(&CuDevice, hNativeDevice, sizeof(CUdevice)); + + auto IsDevice = [=](std::unique_ptr &Dev) { + return Dev->get() == CuDevice; + }; + + // If a platform is provided just check if the device is in it + if (hPlatform) { + auto SearchRes = std::find_if(begin(hPlatform->Devices), + end(hPlatform->Devices), IsDevice); + if (SearchRes != end(hPlatform->Devices)) { + *phDevice = SearchRes->get(); + return UR_RESULT_SUCCESS; + } + } + + // Get list of platforms + uint32_t NumPlatforms = 0; + ur_result_t Result = urPlatformGet(0, nullptr, &NumPlatforms); + if (Result != UR_RESULT_SUCCESS) + return Result; + + ur_platform_handle_t *Plat = static_cast( + malloc(NumPlatforms * sizeof(ur_platform_handle_t))); + Result = urPlatformGet(NumPlatforms, Plat, nullptr); + if (Result != UR_RESULT_SUCCESS) + return Result; + + // Iterate through platforms to find device that matches nativeHandle + for (uint32_t j = 0; j < NumPlatforms; ++j) { + auto SearchRes = + std::find_if(begin(Plat[j]->Devices), end(Plat[j]->Devices), IsDevice); + if (SearchRes != end(Plat[j]->Devices)) { + *phDevice = static_cast((*SearchRes).get()); + return UR_RESULT_SUCCESS; + } + } + + // If the provided nativeHandle cannot be matched to an + // existing device return error + return UR_RESULT_ERROR_INVALID_OPERATION; +} + +ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, + uint64_t *pDeviceTimestamp, + uint64_t *pHostTimestamp) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + CUevent Event; + ScopedContext Active(hDevice->getContext()); + + if (pDeviceTimestamp) { + UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT)); + 
UR_CHECK_ERROR(cuEventRecord(Event, 0)); + } + if (pHostTimestamp) { + + using namespace std::chrono; + *pHostTimestamp = + duration_cast(steady_clock::now().time_since_epoch()) + .count(); + } + + if (pDeviceTimestamp) { + UR_CHECK_ERROR(cuEventSynchronize(Event)); + *pDeviceTimestamp = hDevice->getElapsedTime(Event); + } + + return UR_RESULT_SUCCESS; +} + +/// \return If available, the first binary that is PTX +/// +UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( + ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, + uint32_t NumBinaries, uint32_t *pSelectedBinary) { + // Ignore unused parameter + (void)hDevice; + + UR_ASSERT(pBinaries, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(NumBinaries > 0, UR_RESULT_ERROR_INVALID_ARGUMENT); + + // Look for an image for the NVPTX64 target, and return the first one that is + // found + for (uint32_t i = 0; i < NumBinaries; i++) { + if (strcmp(pBinaries[i].pDeviceTargetSpec, + UR_DEVICE_BINARY_TARGET_NVPTX64) == 0) { + *pSelectedBinary = i; + return UR_RESULT_SUCCESS; + } + } + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp new file mode 100644 index 0000000000000..ff8d85cf7a3d9 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/device.hpp @@ -0,0 +1,59 @@ +//===--------- device.hpp - CUDA Adapter -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include + +struct ur_device_handle_t_ { +private: + using native_type = CUdevice; + + native_type CuDevice; + CUcontext CuContext; + CUevent EvBase; // CUDA event used as base counter + std::atomic_uint32_t RefCount; + ur_platform_handle_t Platform; + + static constexpr uint32_t MaxWorkItemDimensions = 3u; + size_t MaxWorkItemSizes[MaxWorkItemDimensions]; + int MaxWorkGroupSize; + +public: + ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase, + ur_platform_handle_t platform) + : CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1}, + Platform(platform) {} + + ur_device_handle_t_() { cuDevicePrimaryCtxRelease(CuDevice); } + + native_type get() const noexcept { return CuDevice; }; + + CUcontext getContext() const noexcept { return CuContext; }; + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + ur_platform_handle_t getPlatform() const noexcept { return Platform; }; + + uint64_t getElapsedTime(CUevent) const; + + void saveMaxWorkItemSizes(size_t Size, + size_t *SaveMaxWorkItemSizes) noexcept { + memcpy(MaxWorkItemSizes, SaveMaxWorkItemSizes, Size); + }; + + void saveMaxWorkGroupSize(int Value) noexcept { MaxWorkGroupSize = Value; }; + + void getMaxWorkItemSizes(size_t RetSize, + size_t *RetMaxWorkItemSizes) const noexcept { + memcpy(RetMaxWorkItemSizes, MaxWorkItemSizes, RetSize); + }; + + int getMaxWorkGroupSize() const noexcept { return MaxWorkGroupSize; }; +}; + +int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp new file mode 100644 index 0000000000000..8dbd6ee2a27fe --- /dev/null +++ 
b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -0,0 +1,1739 @@ +//===--------- enqueue.cpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "common.hpp" +#include "context.hpp" +#include "event.hpp" +#include "kernel.hpp" +#include "memory.hpp" +#include "queue.hpp" + +#include +#include + +ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + UR_ASSERT(EventWaitList, UR_RESULT_SUCCESS); + + try { + ScopedContext Active(CommandQueue->getContext()); + + auto Result = forLatestEvents( + EventWaitList, NumEventsInWaitList, + [Stream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getStream() == Stream) { + return UR_RESULT_SUCCESS; + } else { + return UR_CHECK_ERROR(cuStreamWaitEvent(Stream, Event->get(), 0)); + } + }); + return Result; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +template +void getUSMHostOrDevicePtr(PtrT USMPtr, CUmemorytype *OutMemType, + CUdeviceptr *OutDevPtr, PtrT *OutHostPtr) { + // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE + // checks with PI_CHECK_ERROR are not suggested + CUresult Ret = cuPointerGetAttribute( + OutMemType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)USMPtr); + // ARRAY, UNIFIED types are not supported! + assert(*OutMemType != CU_MEMORYTYPE_ARRAY && + *OutMemType != CU_MEMORYTYPE_UNIFIED); + + // pointer not known to the CUDA subsystem (possibly a system allocated ptr) + if (Ret == CUDA_ERROR_INVALID_VALUE) { + *OutMemType = CU_MEMORYTYPE_HOST; + *OutDevPtr = 0; + *OutHostPtr = USMPtr; + + // todo: resets the above "non-stick" error + } else if (Ret == CUDA_SUCCESS) { + *OutDevPtr = (*OutMemType == CU_MEMORYTYPE_DEVICE) + ? reinterpret_cast(USMPtr) + : 0; + *OutHostPtr = (*OutMemType == CU_MEMORYTYPE_HOST) ? 
USMPtr : nullptr; + } else { + UR_CHECK_ERROR(Ret); + } +} + +ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size, + ur_usm_advice_flags_t URAdviceFlags, + CUdevice Device) { + std::unordered_map + URToCUMemAdviseDeviceFlagsMap = { + {UR_USM_ADVICE_FLAG_SET_READ_MOSTLY, CU_MEM_ADVISE_SET_READ_MOSTLY}, + {UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY, + CU_MEM_ADVISE_UNSET_READ_MOSTLY}, + {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION, + CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE, + CU_MEM_ADVISE_SET_ACCESSED_BY}, + {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE, + CU_MEM_ADVISE_UNSET_ACCESSED_BY}, + }; + for (auto &FlagPair : URToCUMemAdviseDeviceFlagsMap) { + if (URAdviceFlags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, Device)); + } + } + + std::unordered_map + URToCUMemAdviseHostFlagsMap = { + {UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION_HOST, + CU_MEM_ADVISE_SET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION_HOST, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION}, + {UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_HOST, + CU_MEM_ADVISE_SET_ACCESSED_BY}, + {UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST, + CU_MEM_ADVISE_UNSET_ACCESSED_BY}, + }; + + for (auto &FlagPair : URToCUMemAdviseHostFlagsMap) { + if (URAdviceFlags & FlagPair.first) { + UR_CHECK_ERROR(cuMemAdvise(DevPtr, Size, FlagPair.second, CU_DEVICE_CPU)); + } + } + + std::array UnmappedMemAdviceFlags = { + UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY, + UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY, + UR_USM_ADVICE_FLAG_BIAS_CACHED, UR_USM_ADVICE_FLAG_BIAS_UNCACHED}; + + for (auto &UnmappedFlag : UnmappedMemAdviceFlags) { + if (URAdviceFlags & UnmappedFlag) { + throw UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return UR_RESULT_SUCCESS; +} + +// Determine local work sizes that result in uniform work groups. +// The default threadsPerBlock only require handling the first work_dim +// dimension. +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + const size_t MaxThreadsPerBlock[3], + ur_kernel_handle_t Kernel, uint32_t LocalSize) { + assert(ThreadsPerBlock != nullptr); + assert(GlobalWorkSize != nullptr); + assert(Kernel != nullptr); + int MinGrid, MaxBlockSize, MaxBlockDim[3]; + + // The below assumes a three dimensional range but this is not guaranteed by + // UR. 
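+ // Pad the range out to three dimensions: e.g. a 1-D global range of {1024}
+ // is treated below as {1024, 1, 1}.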
+ size_t GlobalSizeNormalized[3] = {1, 1, 1}; + for (uint32_t i = 0; i < WorkDim; i++) { + GlobalSizeNormalized[i] = GlobalWorkSize[i]; + } + + static auto IsPrime = [](size_t Number) -> bool { + auto LastNumToCheck = ceil(sqrt(Number)); + if (Number < 2) + return false; + if (Number == 2) + return true; + if (Number % 2 == 0) + return false; + for (int i = 3; i <= LastNumToCheck; i += 2) { + if (Number % i == 0) + return false; + } + return true; + }; + + cuDeviceGetAttribute(&MaxBlockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + Device->get()); + cuDeviceGetAttribute(&MaxBlockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + Device->get()); + + UR_CHECK_ERROR( + cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(), + NULL, LocalSize, MaxThreadsPerBlock[0])); + + ThreadsPerBlock[2] = + std::min(GlobalSizeNormalized[2], size_t(MaxBlockDim[2])); + ThreadsPerBlock[1] = std::min( + GlobalSizeNormalized[1], + std::min(MaxBlockSize / ThreadsPerBlock[2], size_t(MaxBlockDim[1]))); + MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]); + ThreadsPerBlock[0] = + std::min(MaxThreadsPerBlock[0], + std::min(GlobalSizeNormalized[0], size_t(MaxBlockDim[0]))); + + // When GlobalSizeNormalized[0] is prime threadPerBlock[0] will later + // computed as 1, which is not efficient configuration. In such case we use + // GlobalSizeNormalized[0] + 1 to compute threadPerBlock[0]. + int Adjusted0DimGlobalWorkSize = + (IsPrime(GlobalSizeNormalized[0]) && + (ThreadsPerBlock[0] != GlobalSizeNormalized[0])) + ? GlobalSizeNormalized[0] + 1 + : GlobalSizeNormalized[0]; + + static auto IsPowerOf2 = [](size_t Value) -> bool { + return Value && !(Value & (Value - 1)); + }; + + // Find a local work group size that is a divisor of the global + // work group size to produce uniform work groups. + // Additionally, for best compute utilisation, the local size has + // to be a power of two. + while (0u != (Adjusted0DimGlobalWorkSize % ThreadsPerBlock[0]) || + !IsPowerOf2(ThreadsPerBlock[0])) { + --ThreadsPerBlock[0]; + } +} + +// Helper to verify out-of-registers case (exceeded block max registers). +// If the kernel requires a number of registers for the entire thread +// block exceeds the hardware limitations, then the cuLaunchKernel call +// will fail to launch with CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES error. +bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, + ur_kernel_handle_t Kernel, + size_t BlockSize) { + int MaxRegsPerBlock{0}; + UR_CHECK_ERROR(cuDeviceGetAttribute( + &MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + Device->get())); + + int RegsPerThread{0}; + UR_CHECK_ERROR(cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, + Kernel->get())); + + return BlockSize * RegsPerThread > size_t(MaxRegsPerBlock); +} + +/// Enqueues a wait on the given CUstream for all specified events (See +/// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued +/// wait will wait on all previous events in the queue. +/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + // This function makes one stream work on the previous work (or work + // represented by input events) and then all future work waits on that stream. 
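+ // Rough outline of what follows (an inference from the code below, not
+ // spelled out in the patch): the chosen compute stream first waits on the
+ // given events, or on temporary events recorded on every other stream when
+ // no events are given; BarrierEvent is then recorded on that stream and the
+ // applied-barrier flags are cleared so the queue can re-apply the barrier
+ // lazily to streams used afterwards.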
+ UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + + ur_result_t Result; + + try { + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + { + std::lock_guard GuardBarrier(hQueue->BarrierMutex); + if (hQueue->BarrierEvent == nullptr) { + UR_CHECK_ERROR( + cuEventCreate(&hQueue->BarrierEvent, CU_EVENT_DISABLE_TIMING)); + } + if (numEventsInWaitList == 0) { // wait on all work + if (hQueue->BarrierTmpEvent == nullptr) { + UR_CHECK_ERROR( + cuEventCreate(&hQueue->BarrierTmpEvent, CU_EVENT_DISABLE_TIMING)); + } + hQueue->syncStreams( + [CuStream, TmpEvent = hQueue->BarrierTmpEvent](CUstream s) { + if (CuStream != s) { + // record a new CUDA event on every stream and make one stream + // wait for these events + UR_CHECK_ERROR(cuEventRecord(TmpEvent, s)); + UR_CHECK_ERROR(cuStreamWaitEvent(CuStream, TmpEvent, 0)); + } + }); + } else { // wait just on given events + forLatestEvents(phEventWaitList, numEventsInWaitList, + [CuStream](ur_event_handle_t Event) -> ur_result_t { + if (Event->getQueue()->hasBeenSynchronized( + Event->getComputeStreamToken())) { + return UR_RESULT_SUCCESS; + } else { + return UR_CHECK_ERROR( + cuStreamWaitEvent(CuStream, Event->get(), 0)); + } + }); + } + + Result = UR_CHECK_ERROR(cuEventRecord(hQueue->BarrierEvent, CuStream)); + for (unsigned int i = 0; i < hQueue->ComputeAppliedBarrier.size(); i++) { + hQueue->ComputeAppliedBarrier[i] = false; + } + for (unsigned int i = 0; i < hQueue->TransferAppliedBarrier.size(); i++) { + hQueue->TransferAppliedBarrier[i] = false; + } + } + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + + if (phEvent) { + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, hQueue, CuStream, StreamToken); + (*phEvent)->start(); + (*phEvent)->record(); + } + + return UR_RESULT_SUCCESS; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +/// Enqueues a wait on the given CUstream for all events. +/// See \ref enqueueEventWait +/// TODO: Add support for multiple streams once the Event class is properly +/// refactored. 
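+/// Hypothetical usage (host-side sketch; event names are made up):
+///   ur_event_handle_t Deps[] = {EvA, EvB};
+///   urEnqueueEventsWait(Queue, 2, Deps, &EvOut);
+/// At the moment this simply forwards to urEnqueueEventsWaitWithBarrier below.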
+/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( + ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + // Preconditions + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hQueue->getContext() == hKernel->getContext(), + UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pGlobalWorkOffset, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + if (*pGlobalWorkSize == 0) { + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); + } + + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + int32_t LocalSize = hKernel->getLocalSize(); + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext Active(hQueue->getContext()); + { + size_t *ReqdThreadsPerBlock = hKernel->ReqdThreadsPerBlock; + MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize(); + hQueue->Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + + if (ProvidedLocalWorkGroupSize) { + auto IsValid = [&](int Dim) { + if (ReqdThreadsPerBlock[Dim] != 0 && + pLocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + + if (pLocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + // Checks that local work sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than + // the global work sizes and not 0. + if (0u == pLocalWorkSize[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (0u != (pGlobalWorkSize[Dim] % pLocalWorkSize[Dim])) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + ThreadsPerBlock[Dim] = pLocalWorkSize[Dim]; + return UR_RESULT_SUCCESS; + }; + + size_t KernelLocalWorkGroupSize = 0; + for (size_t Dim = 0; Dim < workDim; Dim++) { + auto Err = IsValid(Dim); + if (Err != UR_RESULT_SUCCESS) + return Err; + // If no error then sum the total local work size per dim. 
+ KernelLocalWorkGroupSize += pLocalWorkSize[Dim]; + } + + if (hasExceededMaxRegistersPerBlock(hQueue->Device, hKernel, + KernelLocalWorkGroupSize)) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } else { + guessLocalWorkSize(hQueue->Device, ThreadsPerBlock, pGlobalWorkSize, + workDim, MaxThreadsPerBlock, hKernel, LocalSize); + } + } + + if (MaxWorkGroupSize < + size_t(ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2])) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + + for (size_t i = 0; i < workDim; i++) { + BlocksPerGrid[i] = + (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; + } + + std::unique_ptr RetImplEvent{nullptr}; + + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + CUfunction CuFunc = hKernel->get(); + + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + // Set the implicit global offset parameter if kernel has offset variant + if (hKernel->get_with_offset_parameter()) { + std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; + if (pGlobalWorkOffset) { + for (size_t i = 0; i < workDim; i++) { + CudaImplicitOffset[i] = + static_cast(pGlobalWorkOffset[i]); + if (pGlobalWorkOffset[i] != 0) { + CuFunc = hKernel->get_with_offset_parameter(); + } + } + } + hKernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), + CudaImplicitOffset); + } + + auto &ArgIndices = hKernel->getArgIndices(); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken)); + RetImplEvent->start(); + } + + // Set local mem max size if env var is present + static const char *LocalMemSizePtr = + std::getenv("SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE"); + + if (LocalMemSizePtr) { + int DeviceMaxLocalMem = 0; + cuDeviceGetAttribute( + &DeviceMaxLocalMem, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + hQueue->get_device()->get()); + + static const int EnvVal = std::atoi(LocalMemSizePtr); + if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) { + setErrorMessage("Invalid value specified for " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, EnvVal)); + } + + Result = UR_CHECK_ERROR(cuLaunchKernel( + CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], + ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, + CuStream, const_cast(ArgIndices.data()), nullptr)); + if (LocalSize != 0) + hKernel->clearLocalSize(); + + if (phEvent) { + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +/// General 3D memory copy operation. 
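+/// (Helper for the *Rect buffer read/write/copy entry points below; a row or
+/// slice pitch of zero is normalized to a tightly packed pitch at the top of
+/// the body.)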
+/// This function requires the corresponding CUDA context to be at the top of +/// the context stack +/// If the source and/or destination is on the device, SrcPtr and/or DstPtr +/// must be a pointer to a CUdeviceptr +static ur_result_t commonEnqueueMemBufferCopyRect( + CUstream cu_stream, ur_rect_region_t region, const void *SrcPtr, + const CUmemorytype_enum SrcType, ur_rect_offset_t src_offset, + size_t src_row_pitch, size_t src_slice_pitch, void *DstPtr, + const CUmemorytype_enum DstType, ur_rect_offset_t dst_offset, + size_t dst_row_pitch, size_t dst_slice_pitch) { + + UR_ASSERT(SrcType == CU_MEMORYTYPE_DEVICE || SrcType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(DstType == CU_MEMORYTYPE_DEVICE || DstType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + src_row_pitch = + (!src_row_pitch) ? region.width + src_offset.x : src_row_pitch; + src_slice_pitch = (!src_slice_pitch) + ? ((region.height + src_offset.y) * src_row_pitch) + : src_slice_pitch; + dst_row_pitch = + (!dst_row_pitch) ? region.width + dst_offset.x : dst_row_pitch; + dst_slice_pitch = (!dst_slice_pitch) + ? ((region.height + dst_offset.y) * dst_row_pitch) + : dst_slice_pitch; + + CUDA_MEMCPY3D params = {}; + + params.WidthInBytes = region.width; + params.Height = region.height; + params.Depth = region.depth; + + params.srcMemoryType = SrcType; + params.srcDevice = SrcType == CU_MEMORYTYPE_DEVICE + ? *static_cast(SrcPtr) + : 0; + params.srcHost = SrcType == CU_MEMORYTYPE_HOST ? SrcPtr : nullptr; + params.srcXInBytes = src_offset.x; + params.srcY = src_offset.y; + params.srcZ = src_offset.z; + params.srcPitch = src_row_pitch; + params.srcHeight = src_slice_pitch / src_row_pitch; + + params.dstMemoryType = DstType; + params.dstDevice = + DstType == CU_MEMORYTYPE_DEVICE ? *static_cast(DstPtr) : 0; + params.dstHost = DstType == CU_MEMORYTYPE_HOST ? 
DstPtr : nullptr; + params.dstXInBytes = dst_offset.x; + params.dstY = dst_offset.y; + params.dstZ = dst_offset.z; + params.dstPitch = dst_row_pitch; + params.dstHeight = dst_slice_pitch / dst_row_pitch; + + return UR_CHECK_ERROR(cuMemcpy3DAsync(¶ms, cu_stream)); +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, CuStream)); + RetImplEvent->start(); + } + + Result = commonEnqueueMemBufferCopyRect( + CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, + hostRowPitch, hostSlicePitch); + + if (phEvent) { + Result = RetImplEvent->record(); + } + + if (blockingRead) { + Result = UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + if (phEvent) { + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext active(hQueue->getContext()); + CUstream cuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, cuStream)); + RetImplEvent->start(); + } + + Result = commonEnqueueMemBufferCopyRect( + cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, + hostSlicePitch, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + bufferRowPitch, bufferSlicePitch); + + if (phEvent) { + Result = RetImplEvent->record(); + } + + if (blockingWrite) { + Result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + + if (phEvent) { + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( + ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, 
size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + ur_result_t Result; + + auto Stream = hQueue->getNextTransferStream(); + Result = + enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, Stream)); + Result = RetImplEvent->start(); + } + + auto Src = hBufferSrc->Mem.BufferMem.get() + srcOffset; + auto Dst = hBufferDst->Mem.BufferMem.get() + dstOffset; + + Result = UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream)); + + if (phEvent) { + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); + } + + return Result; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( + ur_queue_handle_t hQueue, ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, + size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hBufferSrc, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBufferDst, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr SrcPtr = hBufferSrc->Mem.BufferMem.get(); + CUdeviceptr DstPtr = hBufferDst->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, CuStream)); + RetImplEvent->start(); + } + + Result = commonEnqueueMemBufferCopyRect( + CuStream, region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, srcRowPitch, + srcSlicePitch, &DstPtr, CU_MEMORYTYPE_DEVICE, dstOrigin, dstRowPitch, + dstSlicePitch); + + if (phEvent) { + RetImplEvent->record(); + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t err) { + Result = err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, const void *pPattern, + size_t patternSize, size_t offset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + auto ArgsAreMultiplesOfPatternSize = + (offset % patternSize == 0) || (size % patternSize == 0); + + auto PatternIsValid = (pPattern != nullptr); + + auto PatternSizeIsValid = + ((patternSize & (patternSize - 1)) == 0) && // is power of two + (patternSize > 0) && (patternSize <= 128); // falls within valid range + + UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid && + PatternSizeIsValid, + UR_RESULT_ERROR_INVALID_SIZE); + + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + + auto Stream = hQueue->getNextTransferStream(); + ur_result_t Result; + Result = + 
enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream)); + Result = RetImplEvent->start(); + } + + auto DstDevice = hBuffer->Mem.BufferMem.get() + offset; + auto N = size / patternSize; + + // pattern size in bytes + switch (patternSize) { + case 1: { + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(cuMemsetD8Async(DstDevice, Value, N, Stream)); + break; + } + case 2: { + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(cuMemsetD16Async(DstDevice, Value, N, Stream)); + break; + } + case 4: { + auto Value = *static_cast(pPattern); + Result = UR_CHECK_ERROR(cuMemsetD32Async(DstDevice, Value, N, Stream)); + break; + } + default: { + // CUDA has no memset functions that allow setting values more than 4 + // bytes. PI API lets you pass an arbitrary "pattern" to the buffer + // fill, which can be more than 4 bytes. We must break up the pattern + // into 4 byte values, and set the buffer using multiple strided calls. + // This means that one cuMemsetD2D32Async call is made for every 4 bytes + // in the pattern. + + auto NumberOfSteps = patternSize / sizeof(uint32_t); + + // we walk up the pattern in 4-byte steps, and call cuMemset for each + // 4-byte chunk of the pattern. + for (auto Step = 0u; Step < NumberOfSteps; ++Step) { + // take 4 bytes of the pattern + auto Value = *(static_cast(pPattern) + Step); + + // offset the pointer to the part of the buffer we want to write to + auto OffsetPtr = DstDevice + (Step * sizeof(uint32_t)); + + // set all of the pattern chunks + Result = UR_CHECK_ERROR( + cuMemsetD2D32Async(OffsetPtr, patternSize, Value, 1, N, Stream)); + } + + break; + } + } + + if (phEvent) { + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); + } + + return Result; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } +} + +static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { + switch (ArrayDesc.Format) { + case CU_AD_FORMAT_UNSIGNED_INT8: + case CU_AD_FORMAT_SIGNED_INT8: + return 1; + case CU_AD_FORMAT_UNSIGNED_INT16: + case CU_AD_FORMAT_SIGNED_INT16: + case CU_AD_FORMAT_HALF: + return 2; + case CU_AD_FORMAT_UNSIGNED_INT32: + case CU_AD_FORMAT_SIGNED_INT32: + case CU_AD_FORMAT_FLOAT: + return 4; + default: + sycl::detail::ur::die("Invalid image format."); + return 0; + } +} + +/// General ND memory copy operation for images (where N > 1). 
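+/// (2D images go through cuMemcpy2DAsync and 3D images through cuMemcpy3DAsync;
+/// 1D images are handled directly by the callers via cuMemcpyAtoHAsync,
+/// cuMemcpyHtoAAsync and cuMemcpyAtoA.)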
+/// This function requires the corresponding CUDA context to be at the top of +/// the context stack +/// If the source and/or destination is an array, SrcPtr and/or DstPtr +/// must be a pointer to a CUarray +static ur_result_t commonEnqueueMemImageNDCopy( + CUstream CuStream, ur_mem_type_t ImgType, const ur_rect_region_t Region, + const void *SrcPtr, const CUmemorytype_enum SrcType, + const ur_rect_offset_t SrcOffset, void *DstPtr, + const CUmemorytype_enum DstType, const ur_rect_offset_t DstOffset) { + UR_ASSERT(SrcType == CU_MEMORYTYPE_ARRAY || SrcType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(DstType == CU_MEMORYTYPE_ARRAY || DstType == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + if (ImgType == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.srcArray = *static_cast(SrcPtr); + CpyDesc.srcXInBytes = SrcOffset.x; + CpyDesc.srcY = SrcOffset.y; + } else { + CpyDesc.srcHost = SrcPtr; + } + CpyDesc.dstMemoryType = DstType; + if (DstType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.dstArray = *static_cast(DstPtr); + CpyDesc.dstXInBytes = DstOffset.x; + CpyDesc.dstY = DstOffset.y; + } else { + CpyDesc.dstHost = DstPtr; + } + CpyDesc.WidthInBytes = Region.width; + CpyDesc.Height = Region.height; + return UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, CuStream)); + } + if (ImgType == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = SrcType; + if (SrcType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.srcArray = *static_cast(SrcPtr); + CpyDesc.srcXInBytes = SrcOffset.x; + CpyDesc.srcY = SrcOffset.y; + CpyDesc.srcZ = SrcOffset.z; + } else { + CpyDesc.srcHost = SrcPtr; + } + CpyDesc.dstMemoryType = DstType; + if (DstType == CU_MEMORYTYPE_ARRAY) { + CpyDesc.dstArray = *static_cast(DstPtr); + CpyDesc.dstXInBytes = DstOffset.x; + CpyDesc.dstY = DstOffset.y; + CpyDesc.dstZ = DstOffset.z; + } else { + CpyDesc.dstHost = DstPtr; + } + CpyDesc.WidthInBytes = Region.width; + CpyDesc.Height = Region.height; + CpyDesc.Depth = Region.depth; + return UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc, CuStream)); + } + return UR_RESULT_ERROR_INVALID_VALUE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = rowPitch; + std::ignore = slicePitch; + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + CUarray Array = hImage->Mem.SurfaceMem.getArray(); + + CUDA_ARRAY_DESCRIPTOR ArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); + + int ElementByteSize = imageElementByteSize(ArrayDesc); + + size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; + size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; + + ur_mem_type_t ImgType = 
hImage->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + Result = UR_CHECK_ERROR( + cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, CuStream)); + } else { + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, + region.depth}; + ur_rect_offset_t SrcOffset = {ByteOffsetX, origin.y, origin.z}; + + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY, + SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + } + + if (phEvent) { + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_READ, + hQueue, CuStream); + NewEvent->record(); + *phEvent = NewEvent; + } + + if (blockingRead) { + Result = UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = blockingWrite; + std::ignore = rowPitch; + std::ignore = slicePitch; + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + CUarray Array = hImage->Mem.SurfaceMem.getArray(); + + CUDA_ARRAY_DESCRIPTOR ArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); + + int ElementByteSize = imageElementByteSize(ArrayDesc); + + size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; + size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; + + ur_mem_type_t ImgType = hImage->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + Result = UR_CHECK_ERROR( + cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream)); + } else { + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, + region.depth}; + ur_rect_offset_t DstOffset = {ByteOffsetX, origin.y, origin.z}; + + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, pSrc, CU_MEMORYTYPE_HOST, + ur_rect_offset_t{}, &Array, CU_MEMORYTYPE_ARRAY, DstOffset); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + } + + if (phEvent) { + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_WRITE, + hQueue, CuStream); + NewEvent->record(); + *phEvent = NewEvent; + } + } catch (ur_result_t Err) { + return Err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( + ur_queue_handle_t hQueue, ur_mem_handle_t hImageSrc, + ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageSrc->Mem.SurfaceMem.getImageType() == + hImageDst->Mem.SurfaceMem.getImageType(), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + CUarray SrcArray = hImageSrc->Mem.SurfaceMem.getArray(); + CUarray DstArray = hImageDst->Mem.SurfaceMem.getArray(); + + CUDA_ARRAY_DESCRIPTOR SrcArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray)); + CUDA_ARRAY_DESCRIPTOR DstArrayDesc; + Result = UR_CHECK_ERROR(cuArrayGetDescriptor(&DstArrayDesc, DstArray)); + + UR_ASSERT(SrcArrayDesc.Format == DstArrayDesc.Format, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(SrcArrayDesc.NumChannels == DstArrayDesc.NumChannels, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + int ElementByteSize = imageElementByteSize(SrcArrayDesc); + + size_t DstByteOffsetX = + dstOrigin.x * ElementByteSize * SrcArrayDesc.NumChannels; + size_t SrcByteOffsetX = + srcOrigin.x * ElementByteSize * DstArrayDesc.NumChannels; + size_t BytesToCopy = + ElementByteSize * SrcArrayDesc.NumChannels * region.width; + + ur_mem_type_t ImgType = hImageSrc->Mem.SurfaceMem.getImageType(); + if (ImgType == UR_MEM_TYPE_IMAGE1D) { + Result = UR_CHECK_ERROR(cuMemcpyAtoA(DstArray, DstByteOffsetX, SrcArray, + SrcByteOffsetX, BytesToCopy)); + } else { + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, + region.depth}; + ur_rect_offset_t SrcOffset = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; + ur_rect_offset_t DstOffset = {DstByteOffsetX, dstOrigin.y, dstOrigin.z}; + + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY, + SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset); + + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + } + + if (phEvent) { + auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_COPY, + hQueue, CuStream); + NewEvent->record(); + *phEvent = NewEvent; + } + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + return Result; +} + +/// Implements mapping on the host using a BufferRead operation. +/// Mapped pointers are stored in the pi_mem object. +/// If the buffer uses pinned host memory a pointer to that memory is returned +/// and no read operation is done. 
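+/// Hypothetical usage (host-side sketch; names are made up):
+///   void *Mapped = nullptr;
+///   urEnqueueMemBufferMap(Queue, Buffer, /*blocking*/ true, UR_MAP_FLAG_WRITE,
+///                         0, Size, 0, nullptr, nullptr, &Mapped);
+///   /* ...fill Mapped on the host... */
+///   urEnqueueMemUnmap(Queue, Buffer, Mapped, 0, nullptr, nullptr);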
+/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, void **ppRetMap) { + UR_ASSERT(ppRetMap != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hQueue != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer != nullptr, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + ur_result_t Result = UR_RESULT_ERROR_INVALID_MEM_OBJECT; + const bool IsPinned = + hBuffer->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + + // Currently no support for overlapping regions + if (hBuffer->Mem.BufferMem.getMapPtr() != nullptr) { + return Result; + } + + // Allocate a pointer in the host to store the mapped information + auto HostPtr = hBuffer->Mem.BufferMem.mapToPtr(offset, mapFlags); + *ppRetMap = hBuffer->Mem.BufferMem.getMapPtr(); + if (HostPtr) { + Result = UR_RESULT_SUCCESS; + } + + if (!IsPinned && + ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { + // Pinned host memory is already on host so it doesn't need to be read. + Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, + HostPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext Active(hQueue->getContext()); + + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); + } + + if (phEvent) { + try { + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_MAP, hQueue, hQueue->getNextTransferStream()); + (*phEvent)->start(); + (*phEvent)->record(); + } catch (ur_result_t Err) { + Result = Err; + } + } + } + + return Result; +} + +/// Implements the unmap from the host, using a BufferWrite operation. +/// Requires the mapped pointer to be already registered in the given memobj. +/// If memobj uses pinned host memory, this will not do a write. +/// +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( + ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + ur_result_t Result = UR_RESULT_SUCCESS; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMappedPtr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() != nullptr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->Mem.BufferMem.getMapPtr() == pMappedPtr, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + const bool IsPinned = + hMem->Mem.BufferMem.MemAllocMode == + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + + if (!IsPinned && (hMem->Mem.BufferMem.getMapFlags() & UR_MAP_FLAG_WRITE)) { + // Pinned host memory is only on host so it doesn't need to be written to. 
+ Result = urEnqueueMemBufferWrite( + hQueue, hMem, true, hMem->Mem.BufferMem.getMapOffset(pMappedPtr), + hMem->Mem.BufferMem.getSize(), pMappedPtr, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + ScopedContext Active(hQueue->getContext()); + + if (IsPinned) { + Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, + nullptr); + } + + if (phEvent) { + try { + *phEvent = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_UNMAP, hQueue, hQueue->getNextTransferStream()); + (*phEvent)->start(); + (*phEvent)->record(); + } catch (ur_result_t Err) { + Result = Err; + } + } + } + + hMem->Mem.BufferMem.unmap(pMappedPtr); + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( + ur_queue_handle_t hQueue, void *ptr, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(ptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(size % patternSize == 0, UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_FILL, hQueue, CuStream, StreamToken)); + EventPtr->start(); + } + switch (patternSize) { + case 1: + Result = UR_CHECK_ERROR( + cuMemsetD8Async((CUdeviceptr)ptr, *((const uint8_t *)pPattern) & 0xFF, + size, CuStream)); + break; + case 2: + Result = UR_CHECK_ERROR(cuMemsetD16Async( + (CUdeviceptr)ptr, *((const uint16_t *)pPattern) & 0xFFFF, size, + CuStream)); + break; + case 4: + Result = UR_CHECK_ERROR(cuMemsetD32Async( + (CUdeviceptr)ptr, *((const uint32_t *)pPattern) & 0xFFFFFFFF, size, + CuStream)); + break; + default: + return UR_RESULT_ERROR_INVALID_ARGUMENT; + } + if (phEvent) { + Result = EventPtr->record(); + *phEvent = EventPtr.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( + ur_queue_handle_t hQueue, bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t Result = UR_RESULT_SUCCESS; + + std::unique_ptr EventPtr{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_MEMCPY, hQueue, CuStream)); + EventPtr->start(); + } + Result = UR_CHECK_ERROR( + cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, size, CuStream)); + if (phEvent) { + Result = EventPtr->record(); + } + if (blocking) { + Result = UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + if (phEvent) { + *phEvent = EventPtr.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t 
UR_APICALL urEnqueueUSMPrefetch( + ur_queue_handle_t hQueue, const void *pMem, size_t size, + ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + ur_device_handle_t Device = hQueue->getContext()->getDevice(); + + // Certain cuda devices and Windows do not have support for some Unified + // Memory features. cuMemPrefetchAsync requires concurrent memory access + // for managed memory. Therfore, ignore prefetch hint if concurrent managed + // memory access is not available. + if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + setErrorMessage("Prefetch hint ignored as device does not support " + "concurrent managed access", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + unsigned int IsManaged; + UR_CHECK_ERROR(cuPointerGetAttribute( + &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!IsManaged) { + setErrorMessage("Prefetch hint ignored as prefetch only works with USM", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + // flags is currently unused so fail if set + if (flags != 0) + return UR_RESULT_ERROR_INVALID_VALUE; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY, hQueue, CuStream)); + EventPtr->start(); + } + Result = UR_CHECK_ERROR( + cuMemPrefetchAsync((CUdeviceptr)pMem, size, Device->get(), CuStream)); + if (phEvent) { + Result = EventPtr->record(); + *phEvent = EventPtr.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +/// USM: memadvise API to govern behavior of automatic migration mechanisms +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, + ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + // Certain cuda devices and Windows do not have support for some Unified + // Memory features. Passing CU_MEM_ADVISE_SET/CLEAR_PREFERRED_LOCATION and + // to cuMemAdvise on a GPU device requires the GPU device to report a non-zero + // value for CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Therfore, ignore + // memory advise if concurrent managed memory access is not available. 
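+ // Only the advice flags that map to preferred-location / accessed-by hints
+ // (plus the DEFAULT reset) are gated on that attribute; the read-mostly
+ // flags are translated unconditionally by setCuMemAdvise below.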
+ if ((advice & UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION) || + (advice & UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION) || + (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) || + (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) || + (advice & UR_USM_ADVICE_FLAG_DEFAULT)) { + ur_device_handle_t Device = hQueue->getContext()->getDevice(); + if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { + setErrorMessage("Mem advise ignored as device does not support " + "concurrent managed access", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + // TODO: If ptr points to valid system-allocated pageable memory we should + // check that the device also has the + // CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS property. + } + + unsigned int IsManaged; + UR_CHECK_ERROR(cuPointerGetAttribute( + &IsManaged, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem)); + if (!IsManaged) { + setErrorMessage( + "Memory advice ignored as memory advices only works with USM", + UR_RESULT_SUCCESS); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr EventPtr{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + + if (phEvent) { + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_USM_ADVISE, hQueue, hQueue->getNextTransferStream())); + EventPtr->start(); + } + + if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_READ_MOSTLY, + hQueue->getContext()->getDevice()->get())); + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, + hQueue->getContext()->getDevice()->get())); + UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, + CU_MEM_ADVISE_UNSET_ACCESSED_BY, + hQueue->getContext()->getDevice()->get())); + } else { + Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, + hQueue->getContext()->getDevice()->get()); + } + + if (phEvent) { + Result = EventPtr->record(); + *phEvent = EventPtr.release(); + } + } catch (ur_result_t err) { + Result = err; + } catch (...) { + Result = UR_RESULT_ERROR_UNKNOWN; + } + return Result; +} + +// TODO: Implement this. Remember to return true for +// PI_EXT_ONEAPI_CONTEXT_INFO_USM_FILL2D_SUPPORT when it is implemented. 
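+// Sketch of one possible future implementation (illustrative only, using the
+// pitched driver memsets): for 1/2/4-byte patterns something like
+//   cuMemsetD2D8Async((CUdeviceptr)pMem, pitch, *(const uint8_t *)pPattern,
+//                     width, height, CuStream);
+// (and cuMemsetD2D16Async / cuMemsetD2D32Async for the wider cases) would
+// cover the common sizes, with a row-by-row fallback for larger patterns.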
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill2D( + ur_queue_handle_t, void *, size_t, size_t, const void *, size_t, size_t, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( + ur_queue_handle_t hQueue, bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, size_t width, size_t height, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_QUEUE); + ur_result_t result = UR_RESULT_SUCCESS; + + try { + ScopedContext active(hQueue->getContext()); + CUstream cuStream = hQueue->getNextTransferStream(); + result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList); + if (phEvent) { + (*phEvent) = ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_COPY_RECT, hQueue, cuStream); + (*phEvent)->start(); + } + + // Determine the direction of copy using cuPointerGetAttribute + // for both the SrcPtr and DstPtr + CUDA_MEMCPY2D CpyDesc = {}; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + + getUSMHostOrDevicePtr(pSrc, &CpyDesc.srcMemoryType, &CpyDesc.srcDevice, + &CpyDesc.srcHost); + getUSMHostOrDevicePtr(pDst, &CpyDesc.dstMemoryType, &CpyDesc.dstDevice, + &CpyDesc.dstHost); + + CpyDesc.dstPitch = dstPitch; + CpyDesc.srcPitch = srcPitch; + CpyDesc.WidthInBytes = width; + CpyDesc.Height = height; + + result = UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, cuStream)); + + if (phEvent) { + (*phEvent)->record(); + } + if (blocking) { + result = UR_CHECK_ERROR(cuStreamSynchronize(cuStream)); + } + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(pDst, UR_RESULT_ERROR_INVALID_NULL_POINTER); + if (phEventWaitList) { + UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } else { + UR_ASSERT(numEventsInWaitList == 0, + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_READ, hQueue, CuStream)); + RetImplEvent->start(); + } + + UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, DevPtr + offset, size, CuStream)); + + if (phEvent) { + Result = RetImplEvent->record(); + } + + if (blockingRead) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + if (phEvent) { + *phEvent = RetImplEvent.release(); + } + + } catch (ur_result_t Err) { + Result = Err; + } + + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + size_t 
offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(pSrc, UR_RESULT_ERROR_INVALID_NULL_POINTER); + if (phEventWaitList) { + UR_ASSERT(numEventsInWaitList > 0, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } else { + UR_ASSERT(numEventsInWaitList == 0, + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + } + UR_ASSERT(offset + size <= hBuffer->Mem.BufferMem.Size, + UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + CUdeviceptr DevPtr = hBuffer->Mem.BufferMem.get(); + std::unique_ptr RetImplEvent{nullptr}; + + try { + ScopedContext Active(hQueue->getContext()); + CUstream CuStream = hQueue->getNextTransferStream(); + + Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList); + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, CuStream)); + RetImplEvent->start(); + } + + UR_CHECK_ERROR(cuMemcpyHtoDAsync(DevPtr + offset, pSrc, size, CuStream)); + + if (phEvent) { + Result = RetImplEvent->record(); + } + + if (blockingWrite) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + if (phEvent) { + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingWrite, size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(name && pSrc, UR_RESULT_ERROR_INVALID_VALUE); + + // Since CUDA requires a the global variable to be referenced by name, we use + // metadata to find the correct name to access it by. 
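+ // (GlobalIDMD appears to map the UR-visible name to the actual module
+ // symbol recorded in the program's device-global metadata, which is then
+ // resolved with cuModuleGetGlobal below.)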
+ auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); + if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + std::string DeviceGlobalName = DeviceGlobalNameIt->second; + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + CUdeviceptr DeviceGlobal = 0; + size_t DeviceGlobalSize = 0; + Result = UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, + hProgram->get(), + DeviceGlobalName.c_str())); + + if (offset + count > DeviceGlobalSize) + return UR_RESULT_ERROR_INVALID_VALUE; + + return urEnqueueUSMMemcpy( + hQueue, blockingWrite, reinterpret_cast(DeviceGlobal + offset), + pSrc, count, numEventsInWaitList, phEventWaitList, phEvent); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, const char *name, + bool blockingRead, size_t count, size_t offset, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(name && pDst, UR_RESULT_ERROR_INVALID_VALUE); + + // Since CUDA requires a the global variable to be referenced by name, we use + // metadata to find the correct name to access it by. + auto DeviceGlobalNameIt = hProgram->GlobalIDMD.find(name); + if (DeviceGlobalNameIt == hProgram->GlobalIDMD.end()) + return UR_RESULT_ERROR_INVALID_VALUE; + std::string DeviceGlobalName = DeviceGlobalNameIt->second; + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + CUdeviceptr DeviceGlobal = 0; + size_t DeviceGlobalSize = 0; + Result = UR_CHECK_ERROR(cuModuleGetGlobal(&DeviceGlobal, &DeviceGlobalSize, + hProgram->get(), + DeviceGlobalName.c_str())); + + if (offset + count > DeviceGlobalSize) + return UR_RESULT_ERROR_INVALID_VALUE; + + return urEnqueueUSMMemcpy( + hQueue, blockingRead, pDst, + reinterpret_cast(DeviceGlobal + offset), count, + numEventsInWaitList, phEventWaitList, phEvent); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +/// Host Pipes +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueReadHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pDst, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + (void)hQueue; + (void)hProgram; + (void)pipe_symbol; + (void)blocking; + (void)pDst; + (void)size; + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( + ur_queue_handle_t hQueue, ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, void *pSrc, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + (void)hQueue; + (void)hProgram; + (void)pipe_symbol; + (void)blocking; + (void)pSrc; + (void)size; + (void)numEventsInWaitList; + (void)phEventWaitList; + (void)phEvent; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp new file mode 100644 index 0000000000000..8916197b73f1c --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.cpp @@ -0,0 +1,306 @@ +//===--------- event.cpp - CUDA Adapter 
------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "event.hpp" +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "queue.hpp" + +#include +#include + +ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, + ur_context_handle_t Context, + ur_queue_handle_t Queue, CUstream Stream, + uint32_t StreamToken) + : CommandType{Type}, RefCount{1}, HasOwnership{true}, + HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, + StreamToken{StreamToken}, EvEnd{nullptr}, EvStart{nullptr}, + EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { + + bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + + UR_CHECK_ERROR(cuEventCreate( + &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + + if (ProfilingEnabled) { + UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT)); + UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT)); + } + + if (Queue != nullptr) { + urQueueRetain(Queue); + } + urContextRetain(Context); +} + +ur_event_handle_t_::ur_event_handle_t_(ur_context_handle_t Context, + CUevent EventNative) + : CommandType{UR_COMMAND_EVENTS_WAIT}, RefCount{1}, HasOwnership{false}, + HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, + StreamToken{std::numeric_limits::max()}, EvEnd{EventNative}, + EvStart{nullptr}, EvQueued{nullptr}, Queue{nullptr}, Context{Context} { + urContextRetain(Context); +} + +ur_event_handle_t_::~ur_event_handle_t_() { + if (Queue != nullptr) { + urQueueRelease(Queue); + } + urContextRelease(Context); +} + +ur_result_t ur_event_handle_t_::start() { + assert(!isStarted()); + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + // NOTE: This relies on the default stream to be unused. 
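+      // Recording EvQueued on the legacy default stream (stream 0) timestamps
+      // the point of submission (the NOTE above assumes no other work is
+      // queued there), while EvStart is recorded on the stream that will
+      // actually execute the command.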
+ Result = UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); + Result = UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); + } + } catch (ur_result_t Err) { + Result = Err; + } + + IsStarted = true; + return Result; +} + +bool ur_event_handle_t_::isCompleted() const noexcept { + if (!IsRecorded) { + return false; + } + if (!HasBeenWaitedOn) { + const CUresult Result = cuEventQuery(EvEnd); + if (Result != CUDA_SUCCESS && Result != CUDA_ERROR_NOT_READY) { + UR_CHECK_ERROR(Result); + return false; + } + if (Result == CUDA_ERROR_NOT_READY) { + return false; + } + } + return true; +} + +uint64_t ur_event_handle_t_::getQueuedTime() const { + assert(isStarted()); + return Queue->get_device()->getElapsedTime(EvQueued); +} + +uint64_t ur_event_handle_t_::getStartTime() const { + assert(isStarted()); + return Queue->get_device()->getElapsedTime(EvStart); +} + +uint64_t ur_event_handle_t_::getEndTime() const { + assert(isStarted() && isRecorded()); + return Queue->get_device()->getElapsedTime(EvEnd); +} + +ur_result_t ur_event_handle_t_::record() { + + if (isRecorded() || !isStarted()) { + return UR_RESULT_ERROR_INVALID_EVENT; + } + + ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION; + + UR_ASSERT(Queue, UR_RESULT_ERROR_INVALID_QUEUE); + + try { + EventID = Queue->getNextEventID(); + if (EventID == 0) { + sycl::detail::ur::die( + "Unrecoverable program state reached in event identifier overflow"); + } + Result = UR_CHECK_ERROR(cuEventRecord(EvEnd, Stream)); + } catch (ur_result_t error) { + Result = error; + } + + if (Result == UR_RESULT_SUCCESS) { + IsRecorded = true; + } + + return Result; +} + +ur_result_t ur_event_handle_t_::wait() { + ur_result_t Result; + try { + Result = UR_CHECK_ERROR(cuEventSynchronize(EvEnd)); + HasBeenWaitedOn = true; + } catch (ur_result_t error) { + Result = error; + } + + return Result; +} + +ur_result_t ur_event_handle_t_::release() { + if (!backendHasOwnership()) + return UR_RESULT_SUCCESS; + + assert(Queue != nullptr); + + UR_CHECK_ERROR(cuEventDestroy(EvEnd)); + + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + UR_CHECK_ERROR(cuEventDestroy(EvQueued)); + UR_CHECK_ERROR(cuEventDestroy(EvStart)); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, + ur_event_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropValueSizeRet) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + switch (propName) { + case UR_EVENT_INFO_COMMAND_QUEUE: + return ReturnValue(hEvent->getQueue()); + case UR_EVENT_INFO_COMMAND_TYPE: + return ReturnValue(hEvent->getCommandType()); + case UR_EVENT_INFO_REFERENCE_COUNT: + return ReturnValue(hEvent->getReferenceCount()); + case UR_EVENT_INFO_COMMAND_EXECUTION_STATUS: + return ReturnValue(hEvent->getExecutionStatus()); + case UR_EVENT_INFO_CONTEXT: + return ReturnValue(hEvent->getContext()); + default: + sycl::detail::ur::die("Event info request not implemented"); + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +/// Obtain profiling information from PI CUDA events +/// \TODO Timings from CUDA are only elapsed time. 
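+/// A usage sketch from the caller's side (illustrative only; `Ev` is a
+/// hypothetical completed event and the enum values are those handled below):
+///   uint64_t Start = 0, End = 0;
+///   urEventGetProfilingInfo(Ev, UR_PROFILING_INFO_COMMAND_START,
+///                           sizeof(Start), &Start, nullptr);
+///   urEventGetProfilingInfo(Ev, UR_PROFILING_INFO_COMMAND_END,
+///                           sizeof(End), &End, nullptr);
+///   uint64_t Elapsed = End - Start; // both measured from the device base event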
+UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( + ur_event_handle_t hEvent, ur_profiling_info_t propName, + size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + ur_queue_handle_t Queue = hEvent->getQueue(); + if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; + } + + switch (propName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + // Note: No user for this case + return ReturnValue(static_cast(hEvent->getQueuedTime())); + case UR_PROFILING_INFO_COMMAND_START: + return ReturnValue(static_cast(hEvent->getStartTime())); + case UR_PROFILING_INFO_COMMAND_END: + return ReturnValue(static_cast(hEvent->getEndTime())); + default: + break; + } + sycl::detail::ur::die("Event Profiling info request not implemented"); + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, + ur_execution_info_t, + ur_event_callback_t, + void *) { + sycl::detail::ur::die("Event Callback not implemented in CUDA adapter"); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { + try { + UR_ASSERT(phEventWaitList, UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + UR_ASSERT(numEvents > 0, UR_RESULT_ERROR_INVALID_VALUE); + + auto Context = phEventWaitList[0]->getContext(); + ScopedContext Active(Context); + + auto WaitFunc = [Context](ur_event_handle_t Event) -> ur_result_t { + UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); + UR_ASSERT(Event->getContext() == Context, + UR_RESULT_ERROR_INVALID_CONTEXT); + + return Event->wait(); + }; + return forLatestEvents(phEventWaitList, numEvents, WaitFunc); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + const auto RefCount = hEvent->incrementReferenceCount(); + + sycl::detail::ur::assertion( + RefCount != 0, "Reference count overflow detected in urEventRetain."); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + UR_ASSERT(hEvent, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + sycl::detail::ur::assertion( + hEvent->getReferenceCount() != 0, + "Reference count overflow detected in urEventRelease."); + + // decrement ref count. If it is 0, delete the event. + if (hEvent->decrementReferenceCount() == 0) { + std::unique_ptr event_ptr{hEvent}; + ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; + try { + ScopedContext Active(hEvent->getContext()); + Result = hEvent->release(); + } catch (...) 
{ + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + return Result; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventGetNativeHandle( + ur_event_handle_t hEvent, ur_native_handle_t *phNativeEvent) { + *phNativeEvent = reinterpret_cast(hEvent->get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( + ur_native_handle_t hNativeEvent, ur_context_handle_t hContext, + const ur_event_native_properties_t *pProperties, + ur_event_handle_t *phEvent) { + std::ignore = pProperties; + + std::unique_ptr EventPtr{nullptr}; + + *phEvent = ur_event_handle_t_::makeWithNative( + hContext, reinterpret_cast(hNativeEvent)); + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp new file mode 100644 index 0000000000000..fe56c1e1ab501 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/event.hpp @@ -0,0 +1,189 @@ +//===--------- event.hpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include "queue.hpp" + +/// UR Event mapping to CUevent +/// +struct ur_event_handle_t_ { +public: + using native_type = CUevent; + + ur_result_t record(); + + ur_result_t wait(); + + ur_result_t start(); + + native_type get() const noexcept { return EvEnd; }; + + ur_queue_handle_t getQueue() const noexcept { return Queue; } + + CUstream getStream() const noexcept { return Stream; } + + uint32_t getComputeStreamToken() const noexcept { return StreamToken; } + + ur_command_t getCommandType() const noexcept { return CommandType; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + bool isRecorded() const noexcept { return IsRecorded; } + + bool isStarted() const noexcept { return IsStarted; } + + bool isCompleted() const noexcept; + + uint32_t getExecutionStatus() const noexcept { + + if (!isRecorded()) { + return UR_EVENT_STATUS_SUBMITTED; + } + + if (!isCompleted()) { + return UR_EVENT_STATUS_RUNNING; + } + return UR_EVENT_STATUS_COMPLETE; + } + + ur_context_handle_t getContext() const noexcept { return Context; }; + + uint32_t incrementReferenceCount() { return ++RefCount; } + + uint32_t decrementReferenceCount() { return --RefCount; } + + uint32_t getEventID() const noexcept { return EventID; } + + bool backendHasOwnership() const noexcept { return HasOwnership; } + + // Returns the counter time when the associated command(s) were enqueued + // + uint64_t getQueuedTime() const; + + // Returns the counter time when the associated command(s) started execution + // + uint64_t getStartTime() const; + + // Returns the counter time when the associated command(s) completed + // + uint64_t getEndTime() const; + + // construct a native CUDA. This maps closely to the underlying CUDA event. 
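+  // Illustrative call sequence from the enqueue paths (cf. enqueue.cpp above;
+  // the exact command type, queue and stream depend on the caller):
+  //   auto Ev = std::unique_ptr<ur_event_handle_t_>(
+  //       ur_event_handle_t_::makeNative(UR_COMMAND_MEM_BUFFER_WRITE, Queue,
+  //                                      Stream));
+  //   Ev->start();  // stamps EvQueued/EvStart when profiling is enabled
+  //   ...           // submit the asynchronous work on Stream
+  //   Ev->record(); // stamps EvEnd on Stream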
+  static ur_event_handle_t
+  makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream,
+             uint32_t StreamToken = std::numeric_limits<uint32_t>::max()) {
+    return new ur_event_handle_t_(Type, Queue->getContext(), Queue, Stream,
+                                  StreamToken);
+  }
+
+  static ur_event_handle_t makeWithNative(ur_context_handle_t context,
+                                          CUevent eventNative) {
+    return new ur_event_handle_t_(context, eventNative);
+  }
+
+  ur_result_t release();
+
+  ~ur_event_handle_t_();
+
+private:
+  // This constructor is private to force programmers to use the makeNative /
+  // make_user static members in order to create a pi_event for CUDA.
+  ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context,
+                     ur_queue_handle_t Queue, CUstream Stream,
+                     uint32_t StreamToken);
+
+  // This constructor is private to force programmers to use the
+  // makeWithNative for event interop
+  ur_event_handle_t_(ur_context_handle_t Context, CUevent EventNative);
+
+  ur_command_t CommandType; // The type of command associated with event.
+
+  std::atomic_uint32_t RefCount; // Event reference count.
+
+  bool HasOwnership; // Signifies if event owns the native type.
+
+  bool HasBeenWaitedOn; // Signifies whether the event has been waited
+                        // on through a call to wait(), which implies
+                        // that it has completed.
+
+  bool IsRecorded; // Signifies whether a native CUDA event has been recorded
+                   // yet.
+  bool IsStarted;  // Signifies whether the operation associated with the
+                   // UR event has started or not
+
+  uint32_t StreamToken;
+  uint32_t EventID; // Queue identifier of the event.
+
+  native_type EvEnd; // CUDA event handle. If this ur_event_handle_t represents
+                     // a user event, this will be nullptr.
+
+  native_type EvStart; // CUDA event handle associated with the start
+
+  native_type EvQueued; // CUDA event handle associated with the time
+                        // the command was enqueued
+
+  ur_queue_handle_t Queue; // ur_queue_handle_t associated with the event. If
+                           // this is a user event, this will be nullptr.
+
+  CUstream Stream; // CUstream associated with the event. If this is a user
+                   // event, this will be uninitialized.
+
+  ur_context_handle_t Context; // ur_context_handle_t associated with the event.
+                               // If this is a native event, this will be the
+                               // same context associated with the queue member.
+};
+
+// Iterate over `event_wait_list` and apply the given callback `f` to the
+// latest event on each queue therein. The callback must take a single
+// ur_event_handle_t argument and return a ur_result_t. If the callback returns
+// an error, the iteration terminates and the error is returned.
+template <typename Func>
+ur_result_t forLatestEvents(const ur_event_handle_t *EventWaitList,
+                            std::size_t NumEventsInWaitList, Func &&F) {
+
+  if (EventWaitList == nullptr || NumEventsInWaitList == 0) {
+    return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST;
+  }
+
+  // Fast path if we only have a single event
+  if (NumEventsInWaitList == 1) {
+    return F(EventWaitList[0]);
+  }
+
+  std::vector<ur_event_handle_t> Events{EventWaitList,
+                                        EventWaitList + NumEventsInWaitList};
+  std::sort(Events.begin(), Events.end(),
+            [](ur_event_handle_t Event0, ur_event_handle_t Event1) {
+              // Tiered sort creating sublists of streams (smallest value first)
+              // in which the corresponding events are sorted into a sequence of
+              // newest first.
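+              // e.g. {(S1, id 7), (S0, id 3), (S1, id 9)} sorts (with S0 < S1)
+              // to [(S0, 3), (S1, 9), (S1, 7)], so the loop below only visits
+              // (S0, 3) and (S1, 9).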
+ return Event0->getStream() < Event1->getStream() || + (Event0->getStream() == Event1->getStream() && + Event0->getEventID() > Event1->getEventID()); + }); + + CUstream LastSeenStream = 0; + for (size_t i = 0; i < Events.size(); i++) { + auto Event = Events[i]; + if (!Event || (i != 0 && Event->getStream() == LastSeenStream)) { + continue; + } + + LastSeenStream = Event->getStream(); + + auto Result = F(Event); + if (Result != UR_RESULT_SUCCESS) { + return Result; + } + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp new file mode 100644 index 0000000000000..e1d6f9f9a2cd3 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.cpp @@ -0,0 +1,380 @@ +//===--------- kernel.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "kernel.hpp" +#include "memory.hpp" +#include "sampler.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, + ur_kernel_handle_t *phKernel) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phKernel, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pKernelName, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr Kernel{nullptr}; + + try { + ScopedContext Active(hProgram->getContext()); + + CUfunction CuFunc; + CUresult FunctionResult = + cuModuleGetFunction(&CuFunc, hProgram->get(), pKernelName); + + // We can't add this as a generic mapping in UR_CHECK_ERROR since cuda's + // NOT_FOUND error applies to more than just functions. + if (FunctionResult == CUDA_ERROR_NOT_FOUND) { + throw UR_RESULT_ERROR_INVALID_KERNEL_NAME; + } else { + Result = UR_CHECK_ERROR(FunctionResult); + } + + std::string KernelNameWithOffset = + std::string(pKernelName) + "_with_offset"; + CUfunction CuFuncWithOffsetParam; + CUresult OffsetRes = cuModuleGetFunction( + &CuFuncWithOffsetParam, hProgram->get(), KernelNameWithOffset.c_str()); + + // If there is no kernel with global offset parameter we mark it as missing + if (OffsetRes == CUDA_ERROR_NOT_FOUND) { + CuFuncWithOffsetParam = nullptr; + } else { + Result = UR_CHECK_ERROR(OffsetRes); + } + Kernel = std::unique_ptr( + new ur_kernel_handle_t_{CuFunc, CuFuncWithOffsetParam, pKernelName, + hProgram, hProgram->getContext()}); + } catch (ur_result_t Err) { + Result = Err; + } catch (...) 
{ + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phKernel = Kernel.release(); + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE: { + size_t GlobalWorkSize[3] = {0, 0, 0}; + + int MaxBlockDimX{0}, MaxBlockDimY{0}, MaxBlockDimZ{0}; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxBlockDimX, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxBlockDimY, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxBlockDimZ, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + + int MaxGridDimX{0}, MaxGridDimY{0}, MaxGridDimZ{0}; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxGridDimY, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + hDevice->get()) == CUDA_SUCCESS); + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&MaxGridDimZ, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + hDevice->get()) == CUDA_SUCCESS); + + GlobalWorkSize[0] = MaxBlockDimX * MaxGridDimX; + GlobalWorkSize[1] = MaxBlockDimY * MaxGridDimY; + GlobalWorkSize[2] = MaxBlockDimZ * MaxGridDimZ; + return ReturnValue(GlobalWorkSize, 3); + } + case UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE: { + int MaxThreads = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(size_t(MaxThreads)); + } + case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { + size_t GroupSize[3] = {0, 0, 0}; + const auto &ReqdWGSizeMDMap = + hKernel->get_program()->KernelReqdWorkGroupSizeMD; + const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName()); + if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) { + const auto ReqdWGSize = ReqdWGSizeMD->second; + GroupSize[0] = std::get<0>(ReqdWGSize); + GroupSize[1] = std::get<1>(ReqdWGSize); + GroupSize[2] = std::get<2>(ReqdWGSize); + } + return ReturnValue(GroupSize, 3); + } + case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { + // OpenCL LOCAL == CUDA SHARED + int Bytes = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(Bytes)); + } + case UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: { + // Work groups should be multiples of the warp size + int WarpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(WarpSize)); + } + case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: { + // OpenCL PRIVATE == CUDA LOCAL + int Bytes = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(uint64_t(Bytes)); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) { + UR_ASSERT(hKernel, 
UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->getReferenceCount() > 0u, UR_RESULT_ERROR_INVALID_KERNEL); + + hKernel->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelRelease(ur_kernel_handle_t hKernel) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + UR_ASSERT(hKernel->getReferenceCount() != 0, UR_RESULT_ERROR_INVALID_KERNEL); + + // decrement ref count. If it is 0, delete the program. + if (hKernel->decrementReferenceCount() == 0) { + // no internal cuda resources to clean up. Just delete it. + delete hKernel; + return UR_RESULT_SUCCESS; + } + + return UR_RESULT_SUCCESS; +} + +// TODO(ur): Not implemented on cuda atm. Also, need to add tests for this +// feature. +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( + ur_kernel_handle_t hKernel, ur_native_handle_t *phNativeKernel) { + (void)hKernel; + (void)phNativeKernel; + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgValue(ur_kernel_handle_t hKernel, uint32_t argIndex, + size_t argSize, const void *pArgValue) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(argSize, UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + if (pArgValue) { + hKernel->setKernelArg(argIndex, argSize, pArgValue); + } else { + hKernel->setKernelLocalArg(argIndex, argSize); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, + ur_kernel_info_t propName, + size_t propSize, + void *pKernelInfo, + size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pKernelInfo, pPropSizeRet); + + switch (propName) { + case UR_KERNEL_INFO_FUNCTION_NAME: + return ReturnValue(hKernel->getName()); + case UR_KERNEL_INFO_NUM_ARGS: + return ReturnValue(hKernel->getNumArgs()); + case UR_KERNEL_INFO_REFERENCE_COUNT: + return ReturnValue(hKernel->getReferenceCount()); + case UR_KERNEL_INFO_CONTEXT: + return ReturnValue(hKernel->getContext()); + case UR_KERNEL_INFO_PROGRAM: + return ReturnValue(hKernel->get_program()); + case UR_KERNEL_INFO_ATTRIBUTES: + return ReturnValue(""); + case UR_KERNEL_INFO_NUM_REGS: { + int NumRegs = 0; + sycl::detail::ur::assertion( + cuFuncGetAttribute(&NumRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, + hKernel->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(NumRegs)); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetSubGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, + ur_kernel_sub_group_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + switch (propName) { + case UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE: { + // Sub-group size is equivalent to warp size + int WarpSize = 0; + sycl::detail::ur::assertion( + cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, + hDevice->get()) == CUDA_SUCCESS); + return ReturnValue(static_cast(WarpSize)); + } + case UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS: { + // Number of sub-groups = max block size / warp size + possible remainder + int MaxThreads = 0; + 
sycl::detail::ur::assertion( + cuFuncGetAttribute(&MaxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + hKernel->get()) == CUDA_SUCCESS); + int WarpSize = 0; + urKernelGetSubGroupInfo(hKernel, hDevice, + UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE, + sizeof(uint32_t), &WarpSize, nullptr); + int MaxWarps = (MaxThreads + WarpSize - 1) / WarpSize; + return ReturnValue(static_cast(MaxWarps)); + } + case UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS: { + // Return value of 0 => not specified + // TODO: Revisit if PTX is generated for compile-time work-group sizes + return ReturnValue(0); + } + case UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL: { + // Return value of 0 => unspecified or "auto" sub-group size + // Correct for now, since warp size may be read from special register + // TODO: Return warp size once default is primary sub-group size + // TODO: Revisit if we can recover [[sub_group_size]] attribute from PTX + return ReturnValue(0); + } + default: + break; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer( + ur_kernel_handle_t hKernel, uint32_t argIndex, const void *pArgValue) { + hKernel->setKernelArg(argIndex, sizeof(pArgValue), pArgValue); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( + ur_kernel_handle_t hKernel, uint32_t argIndex, ur_mem_handle_t hArgValue) { + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // Below sets kernel arg when zero-sized buffers are handled. + // In such case the corresponding memory is null. + if (hArgValue == nullptr) { + hKernel->setKernelArg(argIndex, 0, nullptr); + return UR_RESULT_SUCCESS; + } + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + UR_CHECK_ERROR(cuArray3DGetDescriptor( + &arrayDesc, hArgValue->Mem.SurfaceMem.getArray())); + if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && + arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && + arrayDesc.Format != CU_AD_FORMAT_HALF && + arrayDesc.Format != CU_AD_FORMAT_FLOAT) { + setErrorMessage("PI CUDA kernels only support images with channel " + "types int32, uint32, float, and half.", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + CUsurfObject CuSurf = hArgValue->Mem.SurfaceMem.getSurface(); + hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf); + } else { + CUdeviceptr CuPtr = hArgValue->Mem.BufferMem.get(); + hKernel->setKernelArg(argIndex, sizeof(CUdeviceptr), (void *)&CuPtr); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +// A NOP for the CUDA backend +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetExecInfo(ur_kernel_handle_t hKernel, ur_kernel_exec_info_t propName, + size_t propSize, const void *pPropValue) { + std::ignore = propSize; + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pPropValue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + switch (propName) { + case UR_KERNEL_EXEC_INFO_USM_INDIRECT_ACCESS: + case UR_KERNEL_EXEC_INFO_USM_PTRS: + case UR_KERNEL_EXEC_INFO_CACHE_CONFIG: + return UR_RESULT_SUCCESS; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( + ur_native_handle_t hNativeKernel, ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const ur_kernel_native_properties_t *pProperties, + ur_kernel_handle_t *phKernel) { + std::ignore = hNativeKernel; + 
std::ignore = hContext;
+  std::ignore = hProgram;
+  std::ignore = pProperties;
+  std::ignore = phKernel;
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex,
+                      ur_sampler_handle_t hArgValue) {
+  UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  try {
+    uint32_t SamplerProps = hArgValue->Props;
+    hKernel->setKernelArg(argIndex, sizeof(uint32_t), (void *)&SamplerProps);
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
+}
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp
new file mode 100644
index 0000000000000..8b6a617126b08
--- /dev/null
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/kernel.hpp
@@ -0,0 +1,200 @@
+//===--------- kernel.hpp - CUDA Adapter ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===-----------------------------------------------------------------===//
+#pragma once
+
+#include
+#include
+
+#include
+#include
+#include
+
+#include "program.hpp"
+
+/// Implementation of a UR Kernel for CUDA
+///
+/// UR Kernels are used to set kernel arguments,
+/// creating a state on the Kernel object for a given
+/// invocation. This is not the case of CUfunction objects,
+/// which are simply passed together with the arguments on the invocation.
+/// The UR Kernel implementation for CUDA stores the list of arguments,
+/// argument sizes, and offsets to emulate the interface of UR Kernel,
+/// saving the arguments for the later dispatch.
+/// Note that in UR API, the Local memory is specified as a size per
+/// individual argument, but in CUDA only the total usage of shared
+/// memory is required since it is not passed as a parameter.
+/// A compiler pass converts the UR API local memory model into the
+/// CUDA shared model. This object simply calculates the total of
+/// shared memory, and the initial offsets of each parameter.
+struct ur_kernel_handle_t_ {
+  using native_type = CUfunction;
+
+  native_type Function;
+  native_type FunctionWithOffsetParam;
+  std::string Name;
+  ur_context_handle_t Context;
+  ur_program_handle_t Program;
+  std::atomic_uint32_t RefCount;
+
+  static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u;
+  size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions];
+
+  /// Structure that holds the arguments to the kernel.
+  /// Note each argument size is known, since it comes
+  /// from the kernel signature.
+  /// This is not something that can be queried from the CUDA API
+  /// so there is a hard-coded size (\ref MAX_PARAM_BYTES)
+  /// and a storage.
+  struct arguments {
+    static constexpr size_t MaxParamBytes = 4000u;
+    using args_t = std::array<char, MaxParamBytes>;
+    using args_size_t = std::vector<size_t>;
+    using args_index_t = std::vector<void *>;
+    args_t Storage;
+    args_size_t ParamSizes;
+    args_index_t Indices;
+    args_size_t OffsetPerIndex;
+
+    std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0};
+
+    arguments() {
+      // Place the implicit offset index at the end of the indices collection
+      Indices.emplace_back(&ImplicitOffsetArgs);
+    }
+
+    /// Add an argument to the kernel.
+    /// If the argument existed before, it is replaced.
+    /// Otherwise, it is added.
+    /// Gaps are filled with empty arguments.
+ /// Implicit offset argument is kept at the back of the indices collection. + void addArg(size_t Index, size_t Size, const void *Arg, + size_t LocalSize = 0) { + if (Index + 2 > Indices.size()) { + // Move implicit offset argument index with the end + Indices.resize(Index + 2, Indices.back()); + // Ensure enough space for the new argument + ParamSizes.resize(Index + 1); + OffsetPerIndex.resize(Index + 1); + } + ParamSizes[Index] = Size; + // calculate the insertion point on the array + size_t InsertPos = std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + Index, 0); + // Update the stored value for the argument + std::memcpy(&Storage[InsertPos], Arg, Size); + Indices[Index] = &Storage[InsertPos]; + OffsetPerIndex[Index] = LocalSize; + } + + void addLocalArg(size_t Index, size_t Size) { + size_t LocalOffset = this->getLocalSize(); + + // maximum required alignment is the size of the largest vector type + const size_t MaxAlignment = sizeof(double) * 16; + + // for arguments smaller than the maximum alignment simply align to the + // size of the argument + const size_t Alignment = std::min(MaxAlignment, Size); + + // align the argument + size_t AlignedLocalOffset = LocalOffset; + size_t Pad = LocalOffset % Alignment; + if (Pad != 0) { + AlignedLocalOffset += Alignment - Pad; + } + + addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), + Size + (AlignedLocalOffset - LocalOffset)); + } + + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { + assert(Size == sizeof(std::uint32_t) * 3); + std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); + } + + void clearLocalSize() { + std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); + } + + const args_index_t &getIndices() const noexcept { return Indices; } + + uint32_t getLocalSize() const { + return std::accumulate(std::begin(OffsetPerIndex), + std::end(OffsetPerIndex), 0); + } + } Args; + + ur_kernel_handle_t_(CUfunction Func, CUfunction FuncWithOffsetParam, + const char *Name, ur_program_handle_t Program, + ur_context_handle_t Context) + : Function{Func}, FunctionWithOffsetParam{FuncWithOffsetParam}, + Name{Name}, Context{Context}, Program{Program}, RefCount{1} { + urProgramRetain(Program); + urContextRetain(Context); + /// Note: this code assumes that there is only one device per context + ur_result_t RetError = urKernelGetGroupInfo( + this, Context->getDevice(), + UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr); + (void)RetError; + assert(RetError == UR_RESULT_SUCCESS); + } + + ~ur_kernel_handle_t_() { + urProgramRelease(Program); + urContextRelease(Context); + } + + ur_program_handle_t get_program() const noexcept { return Program; } + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + native_type get() const noexcept { return Function; }; + + native_type get_with_offset_parameter() const noexcept { + return FunctionWithOffsetParam; + }; + + bool has_with_offset_parameter() const noexcept { + return FunctionWithOffsetParam != nullptr; + } + + ur_context_handle_t getContext() const noexcept { return Context; }; + + const char *getName() const noexcept { return Name.c_str(); } + + /// Get the number of kernel arguments, excluding the implicit global offset. 
+ /// Note this only returns the current known number of arguments, not the + /// real one required by the kernel, since this cannot be queried from + /// the CUDA Driver API + size_t getNumArgs() const noexcept { return Args.Indices.size() - 1; } + + void setKernelArg(int Index, size_t Size, const void *Arg) { + Args.addArg(Index, Size, Arg); + } + + void setKernelLocalArg(int Index, size_t Size) { + Args.addLocalArg(Index, Size); + } + + void setImplicitOffsetArg(size_t Size, std::uint32_t *ImplicitOffset) { + return Args.setImplicitOffset(Size, ImplicitOffset); + } + + const arguments::args_index_t &getArgIndices() const { + return Args.getIndices(); + } + + uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } + + void clearLocalSize() { Args.clearLocalSize(); } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp new file mode 100644 index 0000000000000..b19acea3159f2 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.cpp @@ -0,0 +1,507 @@ +//===--------- memory.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +#include "common.hpp" +#include "context.hpp" +#include "memory.hpp" + +/// Creates a UR Memory object using a CUDA memory allocation. +/// Can trigger a manual copy depending on the mode. +/// \TODO Implement USE_HOST_PTR using cuHostRegister - See #9789 +/// +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, + const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + // Validate flags + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + if (flags & + (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) { + UR_ASSERT(pProperties && pProperties->pHost, + UR_RESULT_ERROR_INVALID_HOST_PTR); + } + // Need input memory object + UR_ASSERT(phBuffer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(size != 0, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + // Currently, USE_HOST_PTR is not implemented using host register + // since this triggers a weird segfault after program ends. + // Setting this constant to true enables testing that behavior. + const bool EnableUseHostPtr = false; + const bool PerformInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr); + ur_result_t Result = UR_RESULT_SUCCESS; + ur_mem_handle_t MemObj = nullptr; + + try { + ScopedContext Active(hContext); + CUdeviceptr Ptr; + auto HostPtr = pProperties ? 
pProperties->pHost : nullptr; + + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + + if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { + Result = UR_CHECK_ERROR( + cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); + Result = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr; + } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { + Result = UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size)); + Result = UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr; + } else { + Result = UR_CHECK_ERROR(cuMemAlloc(&Ptr, size)); + if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + AllocMode = ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn; + } + } + + if (Result == UR_RESULT_SUCCESS) { + ur_mem_handle_t parentBuffer = nullptr; + + auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, parentBuffer, flags, AllocMode, Ptr, HostPtr, size}); + if (URMemObj != nullptr) { + MemObj = URMemObj.release(); + if (PerformInitialCopy) { + // Operates on the default stream of the current CUDA context. + Result = UR_CHECK_ERROR(cuMemcpyHtoD(Ptr, HostPtr, size)); + // Synchronize with default stream implicitly used by cuMemcpyHtoD + // to make buffer data available on device before any other UR call + // uses it. + if (Result == UR_RESULT_SUCCESS) { + CUstream defaultStream = 0; + Result = UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); + } + } + } else { + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + } + } catch (ur_result_t Err) { + Result = Err; + } catch (...) { + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + *phBuffer = MemObj; + + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hMem->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_MEM_OBJECT); + hMem->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +/// Decreases the reference count of the Mem object. 
+/// If this is zero, calls the relevant CUDA Free function +/// \return UR_RESULT_SUCCESS unless deallocation error +UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + + // Do nothing if there are other references + if (hMem->decrementReferenceCount() > 0) { + return UR_RESULT_SUCCESS; + } + + // make sure hMem is released in case checkErrorUR throws + std::unique_ptr MemObjPtr(hMem); + + if (hMem->isSubBuffer()) { + return UR_RESULT_SUCCESS; + } + + ScopedContext Active(MemObjPtr->getContext()); + + if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { + switch (MemObjPtr->Mem.BufferMem.MemAllocMode) { + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::CopyIn: + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic: + Result = UR_CHECK_ERROR(cuMemFree(MemObjPtr->Mem.BufferMem.Ptr)); + break; + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::UseHostPtr: + Result = UR_CHECK_ERROR( + cuMemHostUnregister(MemObjPtr->Mem.BufferMem.HostPtr)); + break; + case ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::AllocHostPtr: + Result = + UR_CHECK_ERROR(cuMemFreeHost(MemObjPtr->Mem.BufferMem.HostPtr)); + }; + } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { + Result = UR_CHECK_ERROR( + cuSurfObjectDestroy(MemObjPtr->Mem.SurfaceMem.getSurface())); + Result = + UR_CHECK_ERROR(cuArrayDestroy(MemObjPtr->Mem.SurfaceMem.getArray())); + } + + } catch (ur_result_t Err) { + Result = Err; + } catch (...) { + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + if (Result != UR_RESULT_SUCCESS) { + // A reported CUDA error is either an implementation or an asynchronous CUDA + // error for which it is unclear if the function that reported it succeeded + // or not. Either way, the state of the program is compromised and likely + // unrecoverable. + sycl::detail::ur::die( + "Unrecoverable program state reached in urMemRelease"); + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native CUDA handle of a UR mem object +/// +/// \param[in] hMem The UR mem to get the native CUDA object of. +/// \param[out] phNativeMem Set to the native handle of the UR mem object. +/// +/// \return UR_RESULT_SUCCESS +UR_APIEXPORT ur_result_t UR_APICALL +urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { + UR_ASSERT(hMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + *phNativeMem = + reinterpret_cast(hMem->Mem.BufferMem.get()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, + ur_mem_info_t MemInfoType, + size_t propSize, + void *pMemInfo, + size_t *pPropSizeRet) { + UR_ASSERT(hMemory, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(MemInfoType <= UR_MEM_INFO_CONTEXT, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(hMemory->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); + + ScopedContext Active(hMemory->getContext()); + + switch (MemInfoType) { + case UR_MEM_INFO_SIZE: { + try { + size_t AllocSize = 0; + UR_CHECK_ERROR(cuMemGetAddressRange(nullptr, &AllocSize, + hMemory->Mem.BufferMem.Ptr)); + return ReturnValue(AllocSize); + } catch (ur_result_t Err) { + return Err; + } catch (...) 
{
+      return UR_RESULT_ERROR_UNKNOWN;
+    }
+  }
+  case UR_MEM_INFO_CONTEXT: {
+    return ReturnValue(hMemory->getContext());
+  }
+
+  default:
+    return UR_RESULT_ERROR_INVALID_ENUMERATION;
+  }
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle(
+    ur_native_handle_t, ur_context_handle_t, const ur_mem_native_properties_t *,
+    ur_mem_handle_t *) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle(
+    ur_native_handle_t, ur_context_handle_t, const ur_image_format_t *,
+    const ur_image_desc_t *, const ur_mem_native_properties_t *,
+    ur_mem_handle_t *) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+
+/// \TODO Not implemented
+UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
+    ur_context_handle_t hContext, ur_mem_flags_t flags,
+    const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc,
+    void *pHost, ur_mem_handle_t *phMem) {
+  // Need input memory object
+  UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+  UR_ASSERT(pImageDesc, UR_RESULT_ERROR_INVALID_NULL_POINTER);
+  UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0,
+            UR_RESULT_ERROR_INVALID_ENUMERATION);
+  if (flags &
+      (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)) {
+    UR_ASSERT(pHost, UR_RESULT_ERROR_INVALID_HOST_PTR);
+  }
+  const bool PerformInitialCopy =
+      (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) ||
+      ((flags & UR_MEM_FLAG_USE_HOST_POINTER));
+
+  UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC,
+            UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+  UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER,
+            UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+  UR_ASSERT(pImageDesc->numMipLevel == 0,
+            UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+  UR_ASSERT(pImageDesc->numSamples == 0,
+            UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+  if (!pHost) {
+    UR_ASSERT(pImageDesc->rowPitch == 0,
+              UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+    UR_ASSERT(pImageDesc->slicePitch == 0,
+              UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
+  }
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+
+  // We only support RGBA channel order
+  // TODO: check SYCL CTS and spec. May also have to support BGRA
+  UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA,
+            UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
+
+  // We have to use cuArray3DCreate, which has some caveats. The height and
+  // depth parameters must be set to 0 to produce 1D or 2D arrays. pImageDesc
+  // gives a minimum value of 1, so we need to convert the answer.
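+  // e.g. a 1024-element 1D image becomes {Width = 1024, Height = 0, Depth = 0},
+  // while a 1024x512 2D image becomes {Width = 1024, Height = 512, Depth = 0}.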
+ CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; + ArrayDesc.NumChannels = 4; // Only support 4 channel image + ArrayDesc.Flags = 0; // No flags required + ArrayDesc.Width = pImageDesc->width; + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + ArrayDesc.Height = 0; + ArrayDesc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = 0; + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + ArrayDesc.Height = pImageDesc->height; + ArrayDesc.Depth = pImageDesc->depth; + } + + // We need to get this now in bytes for calculating the total image size later + size_t PixelTypeSizeBytes; + + switch (pImageFormat->channelType) { + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + PixelTypeSizeBytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8; + PixelTypeSizeBytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: + ArrayDesc.Format = CU_AD_FORMAT_HALF; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32; + PixelTypeSizeBytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32; + PixelTypeSizeBytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_FLOAT: + ArrayDesc.Format = CU_AD_FORMAT_FLOAT; + PixelTypeSizeBytes = 4; + break; + default: + sycl::detail::ur::die( + "urMemImageCreate given unsupported image_channel_data_type"); + } + + // When a dimension isn't used pImageDesc has the size set to 1 + size_t PixelSizeBytes = + PixelTypeSizeBytes * 4; // 4 is the only number of channels we support + size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width * + pImageDesc->height * pImageDesc->depth; + + ScopedContext Active(hContext); + CUarray ImageArray = nullptr; + try { + Result = UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &ArrayDesc)); + } catch (ur_result_t Err) { + if (Err == UR_RESULT_ERROR_INVALID_VALUE) { + return UR_RESULT_ERROR_INVALID_IMAGE_SIZE; + } + return Err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + try { + if (PerformInitialCopy) { + // We have to use a different copy function for each image dimensionality + if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { + Result = + UR_CHECK_ERROR(cuMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { + CUDA_MEMCPY2D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc.dstArray = ImageArray; + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + Result = UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc)); + } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { + CUDA_MEMCPY3D CpyDesc; + memset(&CpyDesc, 0, sizeof(CpyDesc)); + CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc.srcHost = pHost; + CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc.dstArray = ImageArray; + CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; + CpyDesc.Height = pImageDesc->height; + CpyDesc.Depth = pImageDesc->depth; + Result = UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc)); + } + } + + // CUDA_RESOURCE_DESC is a union of different structs, shown here + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html + // We need to fill it as described here to use it for a surface or texture + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html + // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and + // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array + // handle. + // CUDA_RESOURCE_DESC::flags must be set to zero + + CUDA_RESOURCE_DESC ImageResDesc; + ImageResDesc.res.array.hArray = ImageArray; + ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY; + ImageResDesc.flags = 0; + + CUsurfObject Surface; + Result = UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); + + auto MemObj = std::unique_ptr(new ur_mem_handle_t_( + hContext, ImageArray, Surface, flags, pImageDesc->type, phMem)); + + if (MemObj == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = MemObj.release(); + } catch (ur_result_t Err) { + if (ImageArray) { + cuArrayDestroy(ImageArray); + } + return Err; + } catch (...) { + if (ImageArray) { + cuArrayDestroy(ImageArray); + } + return UR_RESULT_ERROR_UNKNOWN; + } + + return Result; +} + +/// \TODO Not implemented +UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t, + ur_image_info_t, size_t, + void *, size_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// Implements a buffer partition in the CUDA backend. +/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented +/// as an offset over an existing CUDA allocation. +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( + ur_mem_handle_t hBuffer, ur_mem_flags_t flags, + ur_buffer_create_type_t bufferCreateType, const ur_buffer_region_t *pRegion, + ur_mem_handle_t *phMem) { + UR_ASSERT(hBuffer, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT((flags & UR_MEM_FLAGS_MASK) == 0, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(!hBuffer->isSubBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + // Default value for flags means UR_MEM_FLAG_READ_WRITE. 
+ if (flags == 0) { + flags = UR_MEM_FLAG_READ_WRITE; + } + + UR_ASSERT(!(flags & + (UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER | + UR_MEM_FLAG_ALLOC_HOST_POINTER | UR_MEM_FLAG_USE_HOST_POINTER)), + UR_RESULT_ERROR_INVALID_VALUE); + if (hBuffer->MemFlags & UR_MEM_FLAG_WRITE_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_READ_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + if (hBuffer->MemFlags & UR_MEM_FLAG_READ_ONLY) { + UR_ASSERT(!(flags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)), + UR_RESULT_ERROR_INVALID_VALUE); + } + + UR_ASSERT(bufferCreateType == UR_BUFFER_CREATE_TYPE_REGION, + UR_RESULT_ERROR_INVALID_ENUMERATION); + UR_ASSERT(pRegion != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + + assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow"); + UR_ASSERT( + ((pRegion->origin + pRegion->size) <= hBuffer->Mem.BufferMem.getSize()), + UR_RESULT_ERROR_INVALID_BUFFER_SIZE); + // Retained indirectly due to retaining parent buffer below. + ur_context_handle_t Context = hBuffer->Context; + + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode AllocMode = + ur_mem_handle_t_::MemImpl::BufferMem::AllocMode::Classic; + + assert(hBuffer->Mem.BufferMem.Ptr != + ur_mem_handle_t_::MemImpl::BufferMem::native_type{0}); + ur_mem_handle_t_::MemImpl::BufferMem::native_type Ptr = + hBuffer->Mem.BufferMem.Ptr + pRegion->origin; + + void *HostPtr = nullptr; + if (hBuffer->Mem.BufferMem.HostPtr) { + HostPtr = + static_cast(hBuffer->Mem.BufferMem.HostPtr) + pRegion->origin; + } + + std::unique_ptr MemObj{nullptr}; + try { + MemObj = std::unique_ptr{new ur_mem_handle_t_{ + Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}}; + } catch (ur_result_t Err) { + *phMem = nullptr; + return Err; + } catch (...) { + *phMem = nullptr; + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + *phMem = MemObj.release(); + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp new file mode 100644 index 0000000000000..a986607a65d5e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/memory.hpp @@ -0,0 +1,185 @@ +//===--------- memory.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include +#include + +#include "common.hpp" + +/// UR Mem mapping to CUDA memory allocations, both data and texture/surface. +/// \brief Represents non-SVM allocations on the CUDA backend. +/// Keeps tracks of all mapped regions used for Map/Unmap calls. +/// Only one region can be active at the same time per allocation. +struct ur_mem_handle_t_ { + // Context where the memory object is accessible + ur_context_handle_t Context; + + /// Reference counting of the handler + std::atomic_uint32_t RefCount; + enum class Type { Buffer, Surface } MemType; + + // Original mem flags passed + ur_mem_flags_t MemFlags; + + /// A UR Memory object represents either plain memory allocations ("Buffers" + /// in OpenCL) or typed allocations ("Images" in OpenCL). + /// In CUDA their API handlers are different. 
Whereas "Buffers" are allocated + /// as pointer-like structs, "Images" are stored in Textures or Surfaces. + /// This union allows implementation to use either from the same handler. + union MemImpl { + // Handler for plain, pointer-based CUDA allocations + struct BufferMem { + using native_type = CUdeviceptr; + + // If this allocation is a sub-buffer (i.e., a view on an existing + // allocation), this is the pointer to the parent handler structure + ur_mem_handle_t Parent; + // CUDA handler for the pointer + native_type Ptr; + + /// Pointer associated with this device on the host + void *HostPtr; + /// Size of the allocation in bytes + size_t Size; + /// Offset of the active mapped region. + size_t MapOffset; + /// Pointer to the active mapped region, if any + void *MapPtr; + /// Original flags for the mapped region + ur_map_flags_t MapFlags; + + /** AllocMode + * classic: Just a normal buffer allocated on the device via cuda malloc + * use_host_ptr: Use an address on the host for the device + * copy_in: The data for the device comes from the host but the host + pointer is not available later for re-use + * alloc_host_ptr: Uses pinned-memory allocation + */ + enum class AllocMode { + Classic, + UseHostPtr, + CopyIn, + AllocHostPtr, + } MemAllocMode; + + native_type get() const noexcept { return Ptr; } + + size_t getSize() const noexcept { return Size; } + + void *getMapPtr() const noexcept { return MapPtr; } + + size_t getMapOffset(void *) const noexcept { return MapOffset; } + + /// Returns a pointer to data visible on the host that contains + /// the data on the device associated with this allocation. + /// The offset is used to index into the CUDA allocation. + void *mapToPtr(size_t Offset, ur_map_flags_t Flags) noexcept { + assert(MapPtr == nullptr); + MapOffset = Offset; + MapFlags = Flags; + if (HostPtr) { + MapPtr = static_cast(HostPtr) + Offset; + } else { + // TODO: Allocate only what is needed based on the offset + MapPtr = static_cast(malloc(this->getSize())); + } + return MapPtr; + } + + /// Detach the allocation from the host memory. + void unmap(void *) noexcept { + assert(MapPtr != nullptr); + + if (MapPtr != HostPtr) { + free(MapPtr); + } + MapPtr = nullptr; + MapOffset = 0; + } + + ur_map_flags_t getMapFlags() const noexcept { + assert(MapPtr != nullptr); + return MapFlags; + } + } BufferMem; + + // Handler data for surface object (i.e. 
Images) + struct SurfaceMem { + CUarray Array; + CUsurfObject SurfObj; + ur_mem_type_t ImageType; + + CUarray getArray() const noexcept { return Array; } + + CUsurfObject getSurface() const noexcept { return SurfObj; } + + ur_mem_type_t getImageType() const noexcept { return ImageType; } + } SurfaceMem; + } Mem; + + /// Constructs the UR mem handler for a non-typed allocation ("buffer") + ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent, + ur_mem_flags_t MemFlags, MemImpl::BufferMem::AllocMode Mode, + CUdeviceptr Ptr, void *HostPtr, size_t Size) + : Context{Context}, RefCount{1}, MemType{Type::Buffer}, + MemFlags{MemFlags} { + Mem.BufferMem.Ptr = Ptr; + Mem.BufferMem.Parent = Parent; + Mem.BufferMem.HostPtr = HostPtr; + Mem.BufferMem.Size = Size; + Mem.BufferMem.MapOffset = 0; + Mem.BufferMem.MapPtr = nullptr; + Mem.BufferMem.MapFlags = UR_MAP_FLAG_WRITE; + Mem.BufferMem.MemAllocMode = Mode; + if (isSubBuffer()) { + urMemRetain(Mem.BufferMem.Parent); + } else { + urContextRetain(Context); + } + }; + + /// Constructs the UR allocation for an Image object (surface in CUDA) + ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, + CUsurfObject Surf, ur_mem_flags_t MemFlags, + ur_mem_type_t ImageType, void *HostPtr) + : Context{Context}, RefCount{1}, MemType{Type::Surface}, + MemFlags{MemFlags} { + (void)HostPtr; + + Mem.SurfaceMem.Array = Array; + Mem.SurfaceMem.SurfObj = Surf; + Mem.SurfaceMem.ImageType = ImageType; + urContextRetain(Context); + } + + ~ur_mem_handle_t_() { + if (isBuffer() && isSubBuffer()) { + urMemRelease(Mem.BufferMem.Parent); + return; + } + urContextRelease(Context); + } + + bool isBuffer() const noexcept { return MemType == Type::Buffer; } + + bool isSubBuffer() const noexcept { + return (isBuffer() && (Mem.BufferMem.Parent != nullptr)); + } + + bool isImage() const noexcept { return MemType == Type::Surface; } + + ur_context_handle_t getContext() const noexcept { return Context; } + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp new file mode 100644 index 0000000000000..600512d0b01c7 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.cpp @@ -0,0 +1,203 @@ +//===--------- platform.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
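// The BufferMem helpers in memory.hpp above implement Map/Unmap staging: when
// the buffer owns a host pointer the mapping aliases it at the requested
// offset, otherwise a temporary host allocation is malloc'ed and freed on
// unmap, and only one region can be active per allocation at a time. A minimal
// adapter-internal sketch of that bookkeeping (not a public entry point;
// assumes memory.hpp is included and Buf came from urMemBufferCreate):
#include "memory.hpp"

void stageHostCopy(ur_mem_handle_t Buf) {
  // Map the allocation at byte offset 64 for writing.
  void *Host = Buf->Mem.BufferMem.mapToPtr(/*Offset=*/64, UR_MAP_FLAG_WRITE);
  // ... fill or read Host here; the enqueue layer copies to/from the device ...
  Buf->Mem.BufferMem.unmap(Host); // frees the staging buffer if one was made
}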
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "platform.hpp" +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" + +#include +#include +#include + +void enableCUDATracing(); +void disableCUDATracing(); + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( + ur_platform_handle_t hPlatform, ur_platform_info_t PlatformInfoType, + size_t Size, void *pPlatformInfo, size_t *pSizeRet) { + + UR_ASSERT(hPlatform, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(Size, pPlatformInfo, pSizeRet); + + switch (PlatformInfoType) { + case UR_PLATFORM_INFO_NAME: + return ReturnValue("NVIDIA CUDA BACKEND"); + case UR_PLATFORM_INFO_VENDOR_NAME: + return ReturnValue("NVIDIA Corporation"); + case UR_PLATFORM_INFO_PROFILE: + return ReturnValue("FULL PROFILE"); + case UR_PLATFORM_INFO_VERSION: { + auto Version = getCudaVersionString(); + return ReturnValue(Version.c_str()); + } + case UR_PLATFORM_INFO_EXTENSIONS: { + return ReturnValue(""); + } + case UR_PLATFORM_INFO_BACKEND: { + return ReturnValue(UR_PLATFORM_BACKEND_CUDA); + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + return UR_RESULT_SUCCESS; +} + +/// Obtains the CUDA platform. +/// There is only one CUDA platform, and contains all devices on the system. +/// Triggers the CUDA Driver initialization (cuInit) the first time, so this +/// must be the first PI API called. +/// +/// However because multiple devices in a context is not currently supported, +/// place each device in a separate platform. +UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGet(uint32_t NumEntries, ur_platform_handle_t *phPlatforms, + uint32_t *pNumPlatforms) { + + try { + static std::once_flag InitFlag; + static uint32_t NumPlatforms = 1; + static std::vector Platforms; + + UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + + std::call_once( + InitFlag, + [](ur_result_t &Result) { + if (cuInit(0) != CUDA_SUCCESS) { + NumPlatforms = 0; + return; + } + int NumDevices = 0; + Result = UR_CHECK_ERROR(cuDeviceGetCount(&NumDevices)); + if (NumDevices == 0) { + NumPlatforms = 0; + return; + } + try { + // make one platform per device + NumPlatforms = NumDevices; + Platforms.resize(NumDevices); + + for (int i = 0; i < NumDevices; ++i) { + CUdevice Device; + Result = UR_CHECK_ERROR(cuDeviceGet(&Device, i)); + CUcontext Context; + Result = + UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Context, Device)); + + ScopedContext active(Context); + CUevent EvBase; + Result = UR_CHECK_ERROR(cuEventCreate(&EvBase, CU_EVENT_DEFAULT)); + + // Use default stream to record base event counter + Result = UR_CHECK_ERROR(cuEventRecord(EvBase, 0)); + + Platforms[i].Devices.emplace_back(new ur_device_handle_t_{ + Device, Context, EvBase, &Platforms[i]}); + { + const auto &Dev = Platforms[i].Devices.back().get(); + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + ur_result_t RetError = urDeviceGetInfo( + Dev, UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES, + sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr); + if (RetError != UR_RESULT_SUCCESS) { + throw RetError; + } + + RetError = urDeviceGetInfo( + Dev, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(MaxWorkGroupSize), &MaxWorkGroupSize, nullptr); + if (RetError != UR_RESULT_SUCCESS) { + throw RetError; + } + + 
Dev->saveMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + Dev->saveMaxWorkGroupSize(MaxWorkGroupSize); + } + } + } catch (const std::bad_alloc &) { + // Signal out-of-memory situation + for (int i = 0; i < NumDevices; ++i) { + Platforms[i].Devices.clear(); + } + Platforms.clear(); + Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + // Clear and rethrow to allow retry + for (int i = 0; i < NumDevices; ++i) { + Platforms[i].Devices.clear(); + } + Platforms.clear(); + throw; + } + }, + Result); + + if (pNumPlatforms != nullptr) { + *pNumPlatforms = NumPlatforms; + } + + if (phPlatforms != nullptr) { + for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) { + phPlatforms[i] = &Platforms[i]; + } + } + + return Result; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetApiVersion( + ur_platform_handle_t hDriver, ur_api_version_t *pVersion) { + UR_ASSERT(hDriver, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pVersion, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + *pVersion = UR_API_VERSION_CURRENT; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t) { + enableCUDATracing(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urTearDown(void *) { + disableCUDATracing(); + return UR_RESULT_SUCCESS; +} + +// Get CUDA plugin specific backend option. +// Current support is only for optimization options. +// Return empty string for cuda. +// TODO: Determine correct string to be passed. +UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( + ur_platform_handle_t hPlatform, const char *pFrontendOption, + const char **ppPlatformOption) { + std::ignore = hPlatform; + using namespace std::literals; + if (pFrontendOption == nullptr) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || + pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || + pFrontendOption == ""sv) { + *ppPlatformOption = ""; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp new file mode 100644 index 0000000000000..187290718aebf --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/platform.hpp @@ -0,0 +1,15 @@ +//===--------- platform.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +struct ur_platform_handle_t_ { + std::vector> Devices; +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp new file mode 100644 index 0000000000000..e7467af0b8cbf --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.cpp @@ -0,0 +1,476 @@ +//===--------- program.cpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
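// urPlatformGet above lazily initializes the driver with cuInit and exposes
// one platform per CUDA device, so discovery follows the usual two-call
// pattern sketched below (assuming the ur_api.h declarations; error handling
// reduced to early returns).
#include <ur_api.h>
#include <vector>

ur_result_t listCudaPlatforms() {
  uint32_t Count = 0;
  ur_result_t Err = urPlatformGet(0, nullptr, &Count); // query how many
  if (Err != UR_RESULT_SUCCESS)
    return Err;
  std::vector<ur_platform_handle_t> Platforms(Count);
  Err = urPlatformGet(Count, Platforms.data(), nullptr);
  if (Err != UR_RESULT_SUCCESS)
    return Err;
  for (ur_platform_handle_t P : Platforms) {
    char Name[64] = {};
    // Returns the fixed "NVIDIA CUDA BACKEND" name string for this adapter.
    Err = urPlatformGetInfo(P, UR_PLATFORM_INFO_NAME, sizeof(Name), Name,
                            nullptr);
    if (Err != UR_RESULT_SUCCESS)
      return Err;
  }
  return UR_RESULT_SUCCESS;
}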
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "program.hpp" + +bool getMaxRegistersJitOptionValue(const std::string &BuildOptions, + unsigned int &Value) { + using namespace std::string_view_literals; + const std::size_t OptionPos = BuildOptions.find_first_of("maxrregcount"sv); + if (OptionPos == std::string::npos) { + return false; + } + + const std::size_t DelimPos = BuildOptions.find('=', OptionPos + 1u); + if (DelimPos == std::string::npos) { + return false; + } + + const std::size_t Length = BuildOptions.length(); + const std::size_t StartPos = DelimPos + 1u; + if (DelimPos == std::string::npos || StartPos >= Length) { + return false; + } + + std::size_t Pos = StartPos; + while (Pos < Length && + std::isdigit(static_cast(BuildOptions[Pos]))) { + Pos++; + } + + const std::string ValueString = BuildOptions.substr(StartPos, Pos - StartPos); + if (ValueString.empty()) { + return false; + } + + Value = static_cast(std::stoi(ValueString)); + return true; +} + +ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context) + : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, + Context{Context}, KernelReqdWorkGroupSizeMD{} { + urContextRetain(Context); +} + +ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); } + +std::pair +splitMetadataName(const std::string &metadataName) { + size_t splitPos = metadataName.rfind('@'); + if (splitPos == std::string::npos) + return std::make_pair(metadataName, std::string{}); + return std::make_pair(metadataName.substr(0, splitPos), + metadataName.substr(splitPos, metadataName.length())); +} + +ur_result_t +ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, + size_t Length) { + for (size_t i = 0; i < Length; ++i) { + const ur_program_metadata_t MetadataElement = Metadata[i]; + std::string MetadataElementName{MetadataElement.pName}; + + auto [Prefix, Tag] = splitMetadataName(MetadataElementName); + + if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { + // If metadata is reqd_work_group_size, record it for the corresponding + // kernel name. + size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); + + // Expect between 1 and 3 32-bit integer values. + UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) && + MDElemsSize <= sizeof(std::uint32_t) * 3, + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + + // Get pointer to data, skipping 64-bit size at the start of the data. + const char *ValuePtr = + reinterpret_cast(MetadataElement.value.pData) + + sizeof(std::uint64_t); + // Read values and pad with 1's for values not present. + std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; + std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); + KernelReqdWorkGroupSizeMD[Prefix] = + std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1], + ReqdWorkGroupElements[2]); + } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) { + const char *MetadataValPtr = + reinterpret_cast(MetadataElement.value.pData) + + sizeof(std::uint64_t); + const char *MetadataValPtrEnd = + MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t); + GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd}; + } + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_program_handle_t_::setBinary(const char *Source, size_t Length) { + // Do not re-set program binary data which has already been set as that will + // delete the old binary data. 
+ UR_ASSERT(Binary == nullptr && BinarySizeInBytes == 0, + UR_RESULT_ERROR_INVALID_OPERATION); + Binary = Source; + BinarySizeInBytes = Length; + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { + if (BuildOptions) { + this->BuildOptions = BuildOptions; + } + + constexpr const unsigned int NumberOfOptions = 4u; + + std::vector Options(NumberOfOptions); + std::vector OptionVals(NumberOfOptions); + + // Pass a buffer for info messages + Options[0] = CU_JIT_INFO_LOG_BUFFER; + OptionVals[0] = (void *)InfoLog; + // Pass the size of the info buffer + Options[1] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + OptionVals[1] = (void *)(long)MaxLogSize; + // Pass a buffer for error message + Options[2] = CU_JIT_ERROR_LOG_BUFFER; + OptionVals[2] = (void *)ErrorLog; + // Pass the size of the error buffer + Options[3] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + OptionVals[3] = (void *)(long)MaxLogSize; + + if (!this->BuildOptions.empty()) { + unsigned int MaxRegs; + bool Valid = getMaxRegistersJitOptionValue(BuildOptions, MaxRegs); + if (Valid) { + Options.push_back(CU_JIT_MAX_REGISTERS); + OptionVals.push_back(reinterpret_cast(MaxRegs)); + } + } + + auto result = UR_CHECK_ERROR( + cuModuleLoadDataEx(&Module, static_cast(Binary), + Options.size(), Options.data(), OptionVals.data())); + + const auto Success = (result == UR_RESULT_SUCCESS); + + BuildStatus = + Success ? UR_PROGRAM_BUILD_STATUS_SUCCESS : UR_PROGRAM_BUILD_STATUS_ERROR; + + // If no exception, result is correct + return Success ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE; +} + +/// Finds kernel names by searching for entry points in the PTX source, as the +/// CUDA driver API doesn't expose an operation for this. +/// Note: This is currently only being used by the SYCL program class for the +/// has_kernel method, so an alternative would be to move the has_kernel +/// query to UR and use cuModuleGetFunction to check for a kernel. +/// Note: Another alternative is to add kernel names as metadata, like with +/// reqd_work_group_size. +ur_result_t getKernelNames(ur_program_handle_t) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. +/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in +/// terms of CUDA adapter. See \ref urProgramCreateWithBinary. +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, + size_t length, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_device_handle_t hDevice = hContext->getDevice(); + auto pBinary = reinterpret_cast(pIL); + + return urProgramCreateWithBinary(hContext, hDevice, length, pBinary, + pProperties, phProgram); +} + +/// CUDA will handle the PTX/CUBIN binaries internally through a call to +/// cuModuleLoadDataEx. So, urProgramCompile and urProgramBuild are equivalent +/// in terms of CUDA adapter. \TODO Implement asynchronous compilation +UR_APIEXPORT ur_result_t UR_APICALL +urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, + const char *pOptions) { + return urProgramBuild(hContext, hProgram, pOptions); +} + +/// Loads the images from a UR program into a CUmodule that can be +/// used later on to extract functions (kernels). +/// See \ref ur_program_handle_t for implementation details. 
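// buildProgram above feeds the stored PTX/CUBIN into cuModuleLoadDataEx with
// info and error log buffers, and forwards a "maxrregcount=<N>" substring in
// the build options as CU_JIT_MAX_REGISTERS. A minimal caller-side sketch of
// the create-then-build flow (assuming ur_api.h; PTX and PTXSize stand for an
// already loaded PTX image, and buildFromIL is an illustrative helper name):
#include <ur_api.h>

ur_result_t buildFromIL(ur_context_handle_t Ctx, const void *PTX,
                        size_t PTXSize, ur_program_handle_t *Prog) {
  ur_result_t Err =
      urProgramCreateWithIL(Ctx, PTX, PTXSize, /*pProperties=*/nullptr, Prog);
  if (Err != UR_RESULT_SUCCESS)
    return Err;
  // Null options are accepted; the CUDA JIT then uses its defaults.
  return urProgramBuild(Ctx, *Prog, /*pOptions=*/nullptr);
}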
+UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, + ur_program_handle_t hProgram, + const char *pOptions) { + std::ignore = hContext; + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hProgram->getContext()); + + hProgram->buildProgram(pOptions); + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +/// Creates a new UR program object that is the outcome of linking all input +/// programs. +/// \TODO Implement linker options, requires mapping of OpenCL to CUDA +UR_APIEXPORT ur_result_t UR_APICALL +urProgramLink(ur_context_handle_t hContext, uint32_t count, + const ur_program_handle_t *phPrograms, const char *pOptions, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(count, UR_RESULT_ERROR_PROGRAM_LINK_FAILURE); + UR_ASSERT(phPrograms, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + ScopedContext Active(hContext); + + CUlinkState State; + std::unique_ptr RetProgram{ + new ur_program_handle_t_{hContext}}; + + Result = UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &State)); + try { + for (size_t i = 0; i < count; ++i) { + ur_program_handle_t Program = phPrograms[i]; + Result = UR_CHECK_ERROR(cuLinkAddData( + State, CU_JIT_INPUT_PTX, const_cast(Program->Binary), + Program->BinarySizeInBytes, nullptr, 0, nullptr, nullptr)); + } + void *CuBin = nullptr; + size_t CuBinSize = 0; + Result = UR_CHECK_ERROR(cuLinkComplete(State, &CuBin, &CuBinSize)); + + Result = + RetProgram->setBinary(static_cast(CuBin), CuBinSize); + + Result = RetProgram->buildProgram(pOptions); + } catch (...) { + // Upon error attempt cleanup + UR_CHECK_ERROR(cuLinkDestroy(State)); + throw; + } + + Result = UR_CHECK_ERROR(cuLinkDestroy(State)); + *phProgram = RetProgram.release(); + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +/// Created a UR program object from a CUDA program handle. +/// TODO: Implement this. +/// NOTE: The created UR object takes ownership of the native handle. +/// +/// \param[in] nativeHandle The native handle to create UR program object from. +/// \param[in] context The UR context of the program. +/// \param[out] program Set to the UR program object created from native handle. 
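// urProgramLink above drives cuLinkCreate/cuLinkAddData/cuLinkComplete and
// wraps the resulting CUBIN in a fresh program object; each input is added as
// CU_JIT_INPUT_PTX, so the inputs are expected to hold PTX. A minimal sketch
// of linking two already created programs (assuming ur_api.h):
#include <ur_api.h>

ur_result_t linkTwoPrograms(ur_context_handle_t Ctx, ur_program_handle_t A,
                            ur_program_handle_t B,
                            ur_program_handle_t *Linked) {
  const ur_program_handle_t Inputs[] = {A, B};
  return urProgramLink(Ctx, /*count=*/2, Inputs, /*pOptions=*/nullptr, Linked);
}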
+/// +/// \return TBD +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithNativeHandle( + ur_native_handle_t, ur_context_handle_t, + const ur_program_native_properties_t *, ur_program_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice, + ur_program_build_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) { + // Ignore unused parameter + (void)hDevice; + + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_BUILD_INFO_STATUS: { + return ReturnValue(hProgram->BuildStatus); + } + case UR_PROGRAM_BUILD_INFO_OPTIONS: + return ReturnValue(hProgram->BuildOptions.c_str()); + case UR_PROGRAM_BUILD_INFO_LOG: + return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize); + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, + size_t propSize, void *pProgramInfo, size_t *pPropSizeRet) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + UrReturnHelper ReturnValue(propSize, pProgramInfo, pPropSizeRet); + + switch (propName) { + case UR_PROGRAM_INFO_REFERENCE_COUNT: + return ReturnValue(hProgram->getReferenceCount()); + case UR_PROGRAM_INFO_CONTEXT: + return ReturnValue(hProgram->Context); + case UR_PROGRAM_INFO_NUM_DEVICES: + return ReturnValue(1u); + case UR_PROGRAM_INFO_DEVICES: + return ReturnValue(&hProgram->Context->DeviceID, 1); + case UR_PROGRAM_INFO_SOURCE: + return ReturnValue(hProgram->Binary); + case UR_PROGRAM_INFO_BINARY_SIZES: + return ReturnValue(&hProgram->BinarySizeInBytes, 1); + case UR_PROGRAM_INFO_BINARIES: + return ReturnValue(&hProgram->Binary, 1); + case UR_PROGRAM_INFO_KERNEL_NAMES: + return getKernelNames(hProgram); + default: + break; + } + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRetain(ur_program_handle_t program) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(program->getReferenceCount() > 0, UR_RESULT_ERROR_INVALID_PROGRAM); + program->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +/// Decreases the reference count of a ur_program_handle_t object. +/// When the reference count reaches 0, it unloads the module from +/// the context. +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRelease(ur_program_handle_t hProgram) { + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + UR_ASSERT(hProgram->getReferenceCount() != 0, + UR_RESULT_ERROR_INVALID_PROGRAM); + + // decrement ref count. If it is 0, delete the program. + if (hProgram->decrementReferenceCount() == 0) { + + std::unique_ptr ProgramPtr{hProgram}; + + ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM; + + try { + ScopedContext Active(hProgram->getContext()); + auto cuModule = hProgram->get(); + // "0" is a valid handle for a cuModule, so the best way to check if we + // actually loaded a module and need to unload it is to look at the build + // status. + if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_SUCCESS) { + Result = UR_CHECK_ERROR(cuModuleUnload(cuModule)); + } else if (hProgram->BuildStatus == UR_PROGRAM_BUILD_STATUS_NONE) { + // Nothing to free. 
+ Result = UR_RESULT_SUCCESS; + } + } catch (...) { + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + return Result; + } + + return UR_RESULT_SUCCESS; +} + +/// Gets the native CUDA handle of a UR program object +/// +/// \param[in] program The UR program handle to get the native CUDA object of. +/// \param[out] nativeHandle Set to the native handle of the UR program object. +/// +/// \return ur_result_t +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle( + ur_program_handle_t program, ur_native_handle_t *nativeHandle) { + UR_ASSERT(program, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(nativeHandle, UR_RESULT_ERROR_INVALID_NULL_POINTER); + *nativeHandle = reinterpret_cast(program->get()); + return UR_RESULT_SUCCESS; +} + +/// Loads images from a list of PTX or CUBIN binaries. +/// Note: No calls to CUDA driver API in this function, only store binaries +/// for later. +/// +/// Note: Only supports one device +/// +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, + const uint8_t *pBinary, const ur_program_properties_t *pProperties, + ur_program_handle_t *phProgram) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(pBinary != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), + UR_RESULT_ERROR_INVALID_CONTEXT); + UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + + std::unique_ptr RetProgram{ + new ur_program_handle_t_{hContext}}; + + if (pProperties) { + if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + Result = + RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count); + } + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); + + auto pBinary_string = reinterpret_cast(pBinary); + + Result = RetProgram->setBinary(pBinary_string, size); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); + + *phProgram = RetProgram.release(); + + return Result; +} + +// This entry point is only used for native specialization constants (SPIR-V), +// and the CUDA plugin is AOT only so this entry point is not supported. 
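// When a build fails, the CU_JIT info/error text is captured in the program's
// fixed-size log buffers and exposed through urProgramGetBuildInfo with
// UR_PROGRAM_BUILD_INFO_LOG (which returns the info-log buffer shown above).
// A minimal retrieval sketch, assuming ur_api.h; the 8192-byte size mirrors
// MaxLogSize declared in program.hpp:
#include <ur_api.h>
#include <string>

std::string getBuildLog(ur_program_handle_t Prog, ur_device_handle_t Dev) {
  char Log[8192] = {};
  ur_result_t Err = urProgramGetBuildInfo(Prog, Dev, UR_PROGRAM_BUILD_INFO_LOG,
                                          sizeof(Log), Log, nullptr);
  return Err == UR_RESULT_SUCCESS ? std::string(Log) : std::string();
}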
+UR_APIEXPORT ur_result_t UR_APICALL urProgramSetSpecializationConstants( + ur_program_handle_t, uint32_t, const ur_specialization_constant_info_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( + ur_device_handle_t hDevice, ur_program_handle_t hProgram, + const char *pFunctionName, void **ppFunctionPointer) { + // Check if device passed is the same the device bound to the context + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hProgram, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice == hProgram->getContext()->getDevice(), + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(pFunctionName, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(ppFunctionPointer, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + CUfunction Func; + CUresult Ret = cuModuleGetFunction(&Func, hProgram->get(), pFunctionName); + *ppFunctionPointer = Func; + ur_result_t Result = UR_RESULT_SUCCESS; + + if (Ret != CUDA_SUCCESS && Ret != CUDA_ERROR_NOT_FOUND) + Result = UR_CHECK_ERROR(Ret); + if (Ret == CUDA_ERROR_NOT_FOUND) { + *ppFunctionPointer = 0; + Result = UR_RESULT_ERROR_INVALID_FUNCTION_NAME; + } + + return Result; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp new file mode 100644 index 0000000000000..6d47df5b78523 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/program.hpp @@ -0,0 +1,54 @@ +//===--------- program.hpp - CUDA Adapter ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include +#include + +#include +#include + +#include "context.hpp" + +struct ur_program_handle_t_ { + using native_type = CUmodule; + native_type Module; + const char *Binary; + size_t BinarySizeInBytes; + std::atomic_uint32_t RefCount; + ur_context_handle_t Context; + + // Metadata + std::unordered_map> + KernelReqdWorkGroupSizeMD; + std::unordered_map GlobalIDMD; + + constexpr static size_t MaxLogSize = 8192u; + + char ErrorLog[MaxLogSize], InfoLog[MaxLogSize]; + std::string BuildOptions; + ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE; + + ur_program_handle_t_(ur_context_handle_t Context); + ~ur_program_handle_t_(); + + ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length); + + ur_result_t setBinary(const char *Binary, size_t BinarySizeInBytes); + + ur_result_t buildProgram(const char *BuildOptions); + ur_context_handle_t getContext() const { return Context; }; + + native_type get() const noexcept { return Module; }; + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp new file mode 100644 index 0000000000000..1aded75fb0741 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.cpp @@ -0,0 +1,331 @@ +//===--------- queue.cpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
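// urProgramGetFunctionPointer above resolves a kernel symbol in the built
// CUmodule and maps CUDA_ERROR_NOT_FOUND to
// UR_RESULT_ERROR_INVALID_FUNCTION_NAME with a null pointer result. A minimal
// "does this kernel exist" sketch (assuming ur_api.h; KernelName is any symbol
// expected in the loaded module):
#include <ur_api.h>

bool hasKernel(ur_device_handle_t Dev, ur_program_handle_t Prog,
               const char *KernelName) {
  void *Fn = nullptr;
  ur_result_t Err = urProgramGetFunctionPointer(Dev, Prog, KernelName, &Fn);
  return Err == UR_RESULT_SUCCESS && Fn != nullptr;
}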
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "queue.hpp" +#include "common.hpp" +#include "context.hpp" +#include "event.hpp" + +#include +#include + +void ur_queue_handle_t_::computeStreamWaitForBarrierIfNeeded(CUstream Stream, + uint32_t StreamI) { + if (BarrierEvent && !ComputeAppliedBarrier[StreamI]) { + UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); + ComputeAppliedBarrier[StreamI] = true; + } +} + +void ur_queue_handle_t_::transferStreamWaitForBarrierIfNeeded( + CUstream Stream, uint32_t StreamI) { + if (BarrierEvent && !TransferAppliedBarrier[StreamI]) { + UR_CHECK_ERROR(cuStreamWaitEvent(Stream, BarrierEvent, 0)); + TransferAppliedBarrier[StreamI] = true; + } +} + +CUstream ur_queue_handle_t_::getNextComputeStream(uint32_t *StreamToken) { + uint32_t StreamI; + uint32_t Token; + while (true) { + if (NumComputeStreams < ComputeStreams.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard guard(ComputeStreamMutex); + // The second check is done after mutex is locked so other threads can not + // change NumComputeStreams after that + if (NumComputeStreams < ComputeStreams.size()) { + UR_CHECK_ERROR( + cuStreamCreate(&ComputeStreams[NumComputeStreams++], Flags)); + } + } + Token = ComputeStreamIndex++; + StreamI = Token % ComputeStreams.size(); + // if a stream has been reused before it was next selected round-robin + // fashion, we want to delay its next use and instead select another one + // that is more likely to have completed all the enqueued work. + if (DelayCompute[StreamI]) { + DelayCompute[StreamI] = false; + } else { + break; + } + } + if (StreamToken) { + *StreamToken = Token; + } + CUstream res = ComputeStreams[StreamI]; + computeStreamWaitForBarrierIfNeeded(res, StreamI); + return res; +} + +CUstream ur_queue_handle_t_::getNextComputeStream( + uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList, + ur_stream_guard_ &Guard, uint32_t *StreamToken) { + for (uint32_t i = 0; i < NumEventsInWaitList; i++) { + uint32_t Token = EventWaitList[i]->getComputeStreamToken(); + if (reinterpret_cast(EventWaitList[i]->getQueue()) == + this && + canReuseStream(Token)) { + std::unique_lock ComputeSyncGuard(ComputeStreamSyncMutex); + // redo the check after lock to avoid data races on + // LastSyncComputeStreams + if (canReuseStream(Token)) { + uint32_t StreamI = Token % DelayCompute.size(); + DelayCompute[StreamI] = true; + if (StreamToken) { + *StreamToken = Token; + } + Guard = ur_stream_guard_{std::move(ComputeSyncGuard)}; + CUstream Result = EventWaitList[i]->getStream(); + computeStreamWaitForBarrierIfNeeded(Result, StreamI); + return Result; + } + } + } + Guard = {}; + return getNextComputeStream(StreamToken); +} + +CUstream ur_queue_handle_t_::getNextTransferStream() { + if (TransferStreams.empty()) { // for example in in-order queue + return getNextComputeStream(); + } + if (NumTransferStreams < TransferStreams.size()) { + // the check above is for performance - so as not to lock mutex every time + std::lock_guard Guuard(TransferStreamMutex); + // The second check is done after mutex is locked so other threads can not + // change NumTransferStreams after that + if (NumTransferStreams < TransferStreams.size()) { + UR_CHECK_ERROR( + cuStreamCreate(&TransferStreams[NumTransferStreams++], Flags)); + } + } + uint32_t StreamI = 
TransferStreamIndex++ % TransferStreams.size(); + CUstream Result = TransferStreams[StreamI]; + transferStreamWaitForBarrierIfNeeded(Result, StreamI); + return Result; +} + +/// Creates a `ur_queue_handle_t` object on the CUDA backend. +/// Valid properties +/// * __SYCL_PI_CUDA_USE_DEFAULT_STREAM -> CU_STREAM_DEFAULT +/// * __SYCL_PI_CUDA_SYNC_WITH_DEFAULT -> CU_STREAM_NON_BLOCKING +UR_APIEXPORT ur_result_t UR_APICALL +urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) { + try { + std::unique_ptr Queue{nullptr}; + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + if (hContext->getDevice() != hDevice) { + *phQueue = nullptr; + return UR_RESULT_ERROR_INVALID_DEVICE; + } + + unsigned int Flags = CU_STREAM_NON_BLOCKING; + ur_queue_flags_t URFlags = 0; + bool IsOutOfOrder = false; + if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { + URFlags = pProps->flags; + if (URFlags == __SYCL_UR_CUDA_USE_DEFAULT_STREAM) { + Flags = CU_STREAM_DEFAULT; + } else if (URFlags == __SYCL_UR_CUDA_SYNC_WITH_DEFAULT) { + Flags = 0; + } + + if (URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + IsOutOfOrder = true; + } + } + + std::vector ComputeCuStreams( + IsOutOfOrder ? ur_queue_handle_t_::DefaultNumComputeStreams : 1); + std::vector TransferCuStreams( + IsOutOfOrder ? ur_queue_handle_t_::DefaultNumTransferStreams : 0); + + Queue = std::unique_ptr(new ur_queue_handle_t_{ + std::move(ComputeCuStreams), std::move(TransferCuStreams), hContext, + hDevice, Flags, URFlags}); + + *phQueue = Queue.release(); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t Err) { + + return Err; + + } catch (...) { + + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + assert(hQueue->getReferenceCount() > 0); + + hQueue->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + if (hQueue->decrementReferenceCount() > 0) { + return UR_RESULT_SUCCESS; + } + + try { + std::unique_ptr Queue(hQueue); + + if (!hQueue->backendHasOwnership()) + return UR_RESULT_SUCCESS; + + ScopedContext Active(hQueue->getContext()); + + hQueue->forEachStream([](CUstream S) { + UR_CHECK_ERROR(cuStreamSynchronize(S)); + UR_CHECK_ERROR(cuStreamDestroy(S)); + }); + + return UR_RESULT_SUCCESS; + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { + ur_result_t Result = UR_RESULT_SUCCESS; + + try { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + ScopedContext active(hQueue->getContext()); + + hQueue->syncStreams([&Result](CUstream s) { + Result = UR_CHECK_ERROR(cuStreamSynchronize(s)); + }); + + } catch (ur_result_t Err) { + + Result = Err; + + } catch (...) { + + Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + + return Result; +} + +// There is no CUDA counterpart for queue flushing and we don't run into the +// same problem of having to flush cross-queue dependencies as some of the +// other plugins, so it can be left as no-op. 
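// urQueueCreate above allocates pools of (by default non-blocking) CUstreams:
// an in-order queue gets a single compute stream and no transfer streams,
// while an out-of-order queue gets the default pools of 128 compute and 64
// transfer streams. A minimal sketch of requesting an out-of-order queue
// (assuming ur_api.h):
#include <ur_api.h>

ur_result_t makeOutOfOrderQueue(ur_context_handle_t Ctx,
                                ur_device_handle_t Dev,
                                ur_queue_handle_t *Queue) {
  ur_queue_properties_t Props{};
  Props.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES;
  Props.flags = UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
  return urQueueCreate(Ctx, Dev, &Props, Queue);
}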
+UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) { + std::ignore = pDesc; + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(phNativeQueue, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ScopedContext Active(hQueue->getContext()); + *phNativeQueue = + reinterpret_cast(hQueue->getNextComputeStream()); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( + ur_native_handle_t hNativeQueue, ur_context_handle_t hContext, + ur_device_handle_t hDevice, const ur_queue_native_properties_t *pProperties, + ur_queue_handle_t *phQueue) { + (void)pProperties; + + unsigned int CuFlags; + CUstream CuStream = reinterpret_cast(hNativeQueue); + UR_ASSERT(hContext->getDevice() == hDevice, UR_RESULT_ERROR_INVALID_DEVICE); + + auto Return = UR_CHECK_ERROR(cuStreamGetFlags(CuStream, &CuFlags)); + + ur_queue_flags_t Flags = 0; + if (CuFlags == CU_STREAM_DEFAULT) + Flags = __SYCL_UR_CUDA_USE_DEFAULT_STREAM; + else if (CuFlags == CU_STREAM_NON_BLOCKING) + Flags = __SYCL_UR_CUDA_SYNC_WITH_DEFAULT; + else + sycl::detail::ur::die("Unknown cuda stream"); + + std::vector ComputeCuStreams(1, CuStream); + std::vector TransferCuStreams(0); + + // Create queue and set num_compute_streams to 1, as computeCuStreams has + // valid stream + *phQueue = new ur_queue_handle_t_{std::move(ComputeCuStreams), + std::move(TransferCuStreams), + hContext, + hDevice, + CuFlags, + Flags, + /*backend_owns*/ false}; + (*phQueue)->NumComputeStreams = 1; + + return Return; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, + ur_queue_info_t propName, + size_t propValueSize, + void *pPropValue, + size_t *pPropSizeRet) { + UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pPropValue || pPropSizeRet, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(hQueue->Context); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(hQueue->Device); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(hQueue->getReferenceCount()); + case UR_QUEUE_INFO_FLAGS: + return ReturnValue(hQueue->URFlags); + case UR_QUEUE_INFO_EMPTY: { + try { + bool IsReady = hQueue->allOf([](CUstream S) -> bool { + const CUresult Ret = cuStreamQuery(S); + if (Ret == CUDA_SUCCESS) + return true; + + if (Ret == CUDA_ERROR_NOT_READY) + return false; + + UR_CHECK_ERROR(Ret); + return false; + }); + return ReturnValue(IsReady); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + } + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + + return UR_RESULT_ERROR_INVALID_ENUMERATION; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp new file mode 100644 index 0000000000000..69232efcc77e6 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/queue.hpp @@ -0,0 +1,244 @@ +//===--------- queue.hpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
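// urQueueCreateWithNativeHandle above adopts an existing CUstream without
// taking ownership of it and derives the UR flags from cuStreamGetFlags. A
// minimal interop sketch, assuming ur_api.h and the CUDA driver API; the cast
// mirrors the reinterpret_cast the adapter itself uses for native handles:
#include <cuda.h>
#include <ur_api.h>

ur_result_t wrapExistingStream(CUstream Existing, ur_context_handle_t Ctx,
                               ur_device_handle_t Dev,
                               ur_queue_handle_t *Queue) {
  auto Native = reinterpret_cast<ur_native_handle_t>(Existing);
  return urQueueCreateWithNativeHandle(Native, Ctx, Dev,
                                       /*pProperties=*/nullptr, Queue);
}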
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// +#pragma once + +#include + +#include +#include + +using ur_stream_guard_ = std::unique_lock; + +/// UR queue mapping on to CUstream objects. +/// +struct ur_queue_handle_t_ { + + using native_type = CUstream; + static constexpr int DefaultNumComputeStreams = 128; + static constexpr int DefaultNumTransferStreams = 64; + + std::vector ComputeStreams; + std::vector TransferStreams; + // delay_compute_ keeps track of which streams have been recently reused and + // their next use should be delayed. If a stream has been recently reused it + // will be skipped the next time it would be selected round-robin style. When + // skipped, its delay flag is cleared. + std::vector DelayCompute; + // keep track of which streams have applied barrier + std::vector ComputeAppliedBarrier; + std::vector TransferAppliedBarrier; + ur_context_handle_t_ *Context; + ur_device_handle_t_ *Device; + CUevent BarrierEvent = nullptr; + CUevent BarrierTmpEvent = nullptr; + std::atomic_uint32_t RefCount; + std::atomic_uint32_t EventCount; + std::atomic_uint32_t ComputeStreamIndex; + std::atomic_uint32_t TransferStreamIndex; + unsigned int NumComputeStreams; + unsigned int NumTransferStreams; + unsigned int LastSyncComputeStreams; + unsigned int LastSyncTransferStreams; + unsigned int Flags; + ur_queue_flags_t URFlags; + // When ComputeStreamSyncMutex and ComputeStreamMutex both need to be + // locked at the same time, ComputeStreamSyncMutex should be locked first + // to avoid deadlocks + std::mutex ComputeStreamSyncMutex; + std::mutex ComputeStreamMutex; + std::mutex TransferStreamMutex; + std::mutex BarrierMutex; + bool HasOwnership; + + ur_queue_handle_t_(std::vector &&ComputeStreams, + std::vector &&TransferStreams, + ur_context_handle_t_ *Context, ur_device_handle_t_ *Device, + unsigned int Flags, ur_queue_flags_t URFlags, + bool BackendOwns = true) + : ComputeStreams{std::move(ComputeStreams)}, + TransferStreams{std::move(TransferStreams)}, + DelayCompute(this->ComputeStreams.size(), false), + ComputeAppliedBarrier(this->ComputeStreams.size()), + TransferAppliedBarrier(this->TransferStreams.size()), Context{Context}, + Device{Device}, RefCount{1}, EventCount{0}, ComputeStreamIndex{0}, + TransferStreamIndex{0}, NumComputeStreams{0}, NumTransferStreams{0}, + LastSyncComputeStreams{0}, LastSyncTransferStreams{0}, Flags(Flags), + URFlags(URFlags), HasOwnership{BackendOwns} { + urContextRetain(Context); + urDeviceRetain(Device); + } + + ~ur_queue_handle_t_() { + urContextRelease(Context); + urDeviceRelease(Device); + } + + void computeStreamWaitForBarrierIfNeeded(CUstream Strean, uint32_t StreamI); + void transferStreamWaitForBarrierIfNeeded(CUstream Stream, uint32_t StreamI); + + // get_next_compute/transfer_stream() functions return streams from + // appropriate pools in round-robin fashion + native_type getNextComputeStream(uint32_t *StreamToken = nullptr); + // this overload tries select a stream that was used by one of dependencies. + // If that is not possible returns a new stream. 
If a stream is reused it + // returns a lock that needs to remain locked as long as the stream is in use + native_type getNextComputeStream(uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ur_stream_guard_ &Guard, + uint32_t *StreamToken = nullptr); + native_type getNextTransferStream(); + native_type get() { return getNextComputeStream(); }; + + bool hasBeenSynchronized(uint32_t StreamToken) { + // stream token not associated with one of the compute streams + if (StreamToken == std::numeric_limits::max()) { + return false; + } + return LastSyncComputeStreams >= StreamToken; + } + + bool canReuseStream(uint32_t StreamToken) { + // stream token not associated with one of the compute streams + if (StreamToken == std::numeric_limits::max()) { + return false; + } + // If the command represented by the stream token was not the last command + // enqueued to the stream we can not reuse the stream - we need to allow for + // commands enqueued after it and the one we are about to enqueue to run + // concurrently + bool IsLastCommand = + (ComputeStreamIndex - StreamToken) <= ComputeStreams.size(); + // If there was a barrier enqueued to the queue after the command + // represented by the stream token we should not reuse the stream, as we can + // not take that stream into account for the bookkeeping for the next + // barrier - such a stream would not be synchronized with. Performance-wise + // it does not matter that we do not reuse the stream, as the work + // represented by the stream token is guaranteed to be complete by the + // barrier before any work we are about to enqueue to the stream will start, + // so the event does not need to be synchronized with. + return IsLastCommand && !hasBeenSynchronized(StreamToken); + } + + template bool allOf(T &&F) { + { + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + if (!std::all_of(ComputeStreams.begin(), ComputeStreams.begin() + End, F)) + return false; + } + { + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + if (!std::all_of(TransferStreams.begin(), TransferStreams.begin() + End, + F)) + return false; + } + return true; + } + + template void forEachStream(T &&F) { + { + std::lock_guard compute_guard(ComputeStreamMutex); + unsigned int End = std::min( + static_cast(ComputeStreams.size()), NumComputeStreams); + for (unsigned int i = 0; i < End; i++) { + F(ComputeStreams[i]); + } + } + { + std::lock_guard transfer_guard(TransferStreamMutex); + unsigned int End = + std::min(static_cast(TransferStreams.size()), + NumTransferStreams); + for (unsigned int i = 0; i < End; i++) { + F(TransferStreams[i]); + } + } + } + + template void syncStreams(T &&F) { + auto SyncCompute = [&F, &Streams = ComputeStreams, &Delay = DelayCompute]( + unsigned int Start, unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(Streams[i]); + Delay[i] = false; + } + }; + auto SyncTransfer = [&F, &streams = TransferStreams](unsigned int Start, + unsigned int Stop) { + for (unsigned int i = Start; i < Stop; i++) { + F(streams[i]); + } + }; + { + unsigned int Size = static_cast(ComputeStreams.size()); + std::lock_guard ComputeSyncGuard(ComputeStreamSyncMutex); + std::lock_guard ComputeGuard(ComputeStreamMutex); + unsigned int Start = LastSyncComputeStreams; + unsigned int End = NumComputeStreams < Size ? 
NumComputeStreams + : ComputeStreamIndex.load(); + if (ResetUsed) { + LastSyncComputeStreams = End; + } + if (End - Start >= Size) { + SyncCompute(0, Size); + } else { + Start %= Size; + End %= Size; + if (Start <= End) { + SyncCompute(Start, End); + } else { + SyncCompute(Start, Size); + SyncCompute(0, End); + } + } + } + { + unsigned int Size = static_cast(TransferStreams.size()); + if (!Size) { + return; + } + std::lock_guard TransferGuard(TransferStreamMutex); + unsigned int Start = LastSyncTransferStreams; + unsigned int End = NumTransferStreams < Size ? NumTransferStreams + : TransferStreamIndex.load(); + if (ResetUsed) { + LastSyncTransferStreams = End; + } + if (End - Start >= Size) { + SyncTransfer(0, Size); + } else { + Start %= Size; + End %= Size; + if (Start <= End) { + SyncTransfer(Start, End); + } else { + SyncTransfer(Start, Size); + SyncTransfer(0, End); + } + } + } + } + + ur_context_handle_t_ *getContext() const { return Context; }; + + ur_device_handle_t_ *get_device() const { return Device; }; + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } + + uint32_t getNextEventID() noexcept { return ++EventCount; } + + bool backendHasOwnership() const noexcept { return HasOwnership; } +}; diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp new file mode 100644 index 0000000000000..36ec89fb9da3c --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.cpp @@ -0,0 +1,86 @@ +//===--------- sampler.cpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "sampler.hpp" +#include "common.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc, + ur_sampler_handle_t *phSampler) { + std::unique_ptr Sampler{ + new ur_sampler_handle_t_(hContext)}; + + if (pDesc && pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { + Sampler->Props |= pDesc->normalizedCoords; + Sampler->Props |= pDesc->filterMode << 1; + Sampler->Props |= pDesc->addressingMode << 2; + } else { + // Set default values + Sampler->Props |= true; // Normalized Coords + Sampler->Props |= UR_SAMPLER_ADDRESSING_MODE_CLAMP << 2; + } + + *phSampler = Sampler.release(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName, + size_t propValueSize, void *pPropValue, size_t *pPropSizeRet) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_SAMPLER_INFO_REFERENCE_COUNT: + return ReturnValue(hSampler->getReferenceCount()); + case UR_SAMPLER_INFO_CONTEXT: + return ReturnValue(hSampler->Context); + case UR_SAMPLER_INFO_NORMALIZED_COORDS: { + bool NormCoordsProp = static_cast(hSampler->Props); + return ReturnValue(NormCoordsProp); + } + case UR_SAMPLER_INFO_FILTER_MODE: { + auto FilterProp = + static_cast((hSampler->Props >> 1) & 0x1); + return ReturnValue(FilterProp); + } + case UR_SAMPLER_INFO_ADDRESSING_MODE: { + auto AddressingProp = + static_cast(hSampler->Props >> 2); + return ReturnValue(AddressingProp); + } + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + return {}; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urSamplerRetain(ur_sampler_handle_t hSampler) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + hSampler->incrementReferenceCount(); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urSamplerRelease(ur_sampler_handle_t hSampler) { + UR_ASSERT(hSampler, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + // double delete or someone is messing with the ref count. + // either way, cannot safely proceed. + sycl::detail::ur::assertion( + hSampler->getReferenceCount() != 0, + "Reference count overflow detected in urSamplerRelease."); + + // decrement ref count. If it is 0, delete the sampler. + if (hSampler->decrementReferenceCount() == 0) { + delete hSampler; + } + + return UR_RESULT_SUCCESS; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp new file mode 100644 index 0000000000000..6dbbb124ffc3e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/sampler.hpp @@ -0,0 +1,29 @@ +//===--------- sampler.hpp - CUDA Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +/// Implementation of samplers for CUDA +/// +/// Sampler property layout: +/// | 31 30 ... 
6 5 | 4 3 2 | 1 | 0 | +/// | N/A | addressing mode | fiter mode | normalize coords | +struct ur_sampler_handle_t_ { + std::atomic_uint32_t RefCount; + uint32_t Props; + ur_context_handle_t Context; + + ur_sampler_handle_t_(ur_context_handle_t Context) + : RefCount(1), Props(0), Context(Context) {} + + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } + + uint32_t decrementReferenceCount() noexcept { return --RefCount; } + + uint32_t getReferenceCount() const noexcept { return RefCount; } +}; diff --git a/sycl/plugins/cuda/tracing.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp similarity index 100% rename from sycl/plugins/cuda/tracing.cpp rename to sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp new file mode 100644 index 0000000000000..c7258ad241373 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -0,0 +1,264 @@ +//===--------- ur_interface_loader.cpp - Unified Runtime ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include +#include + +namespace { + +// TODO - this is a duplicate of what is in the L0 plugin +// We should move this to somewhere common +ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { + if (pDdiTable == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + // Pre 1.0 we enforce that loader and adapter must have the same version. + // Post 1.0 only a major version match should be required. 
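// The sampler handle above packs its state into a single 32-bit Props word:
// bit 0 = normalized coords, bit 1 = filter mode, bits 2 and up = addressing
// mode. A minimal sketch of creating a sampler and reading one field back,
// assuming ur_api.h; UR_SAMPLER_FILTER_MODE_NEAREST is the assumed enum name
// for the default filter:
#include <ur_api.h>

ur_result_t makeClampSampler(ur_context_handle_t Ctx,
                             ur_sampler_handle_t *Sampler) {
  ur_sampler_desc_t Desc{};
  Desc.stype = UR_STRUCTURE_TYPE_SAMPLER_DESC;
  Desc.normalizedCoords = true;
  Desc.filterMode = UR_SAMPLER_FILTER_MODE_NEAREST;
  Desc.addressingMode = UR_SAMPLER_ADDRESSING_MODE_CLAMP;
  ur_result_t Err = urSamplerCreate(Ctx, &Desc, Sampler);
  if (Err != UR_RESULT_SUCCESS)
    return Err;
  ur_sampler_addressing_mode_t Mode;
  // Decodes Props >> 2 back into the addressing mode set above.
  return urSamplerGetInfo(*Sampler, UR_SAMPLER_INFO_ADDRESSING_MODE,
                          sizeof(Mode), &Mode, nullptr);
}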
+ if (version != UR_API_VERSION_CURRENT) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + return UR_RESULT_SUCCESS; +} +} // namespace + +#if defined(__cplusplus) +extern "C" { +#endif + +UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( + ur_api_version_t version, ur_platform_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGet = urPlatformGet; + pDdiTable->pfnGetApiVersion = urPlatformGetApiVersion; + pDdiTable->pfnGetInfo = urPlatformGetInfo; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( + ur_api_version_t version, ur_context_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urContextCreate; + pDdiTable->pfnCreateWithNativeHandle = urContextCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urContextGetInfo; + pDdiTable->pfnGetNativeHandle = urContextGetNativeHandle; + pDdiTable->pfnRelease = urContextRelease; + pDdiTable->pfnRetain = urContextRetain; + pDdiTable->pfnSetExtendedDeleter = urContextSetExtendedDeleter; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( + ur_api_version_t version, ur_event_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = urEventCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urEventGetInfo; + pDdiTable->pfnGetNativeHandle = urEventGetNativeHandle; + pDdiTable->pfnGetProfilingInfo = urEventGetProfilingInfo; + pDdiTable->pfnRelease = urEventRelease; + pDdiTable->pfnRetain = urEventRetain; + pDdiTable->pfnSetCallback = urEventSetCallback; + pDdiTable->pfnWait = urEventWait; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( + ur_api_version_t version, ur_program_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBuild = urProgramBuild; + pDdiTable->pfnCompile = urProgramCompile; + pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; + pDdiTable->pfnCreateWithIL = urProgramCreateWithIL; + pDdiTable->pfnCreateWithNativeHandle = urProgramCreateWithNativeHandle; + pDdiTable->pfnGetBuildInfo = urProgramGetBuildInfo; + pDdiTable->pfnGetFunctionPointer = urProgramGetFunctionPointer; + pDdiTable->pfnGetInfo = urProgramGetInfo; + pDdiTable->pfnGetNativeHandle = urProgramGetNativeHandle; + pDdiTable->pfnLink = urProgramLink; + pDdiTable->pfnRelease = urProgramRelease; + pDdiTable->pfnRetain = urProgramRetain; + pDdiTable->pfnSetSpecializationConstants = + urProgramSetSpecializationConstants; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urKernelCreate; + pDdiTable->pfnCreateWithNativeHandle = urKernelCreateWithNativeHandle; + pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; + pDdiTable->pfnGetInfo = urKernelGetInfo; + pDdiTable->pfnGetNativeHandle = 
urKernelGetNativeHandle; + pDdiTable->pfnGetSubGroupInfo = urKernelGetSubGroupInfo; + pDdiTable->pfnRelease = urKernelRelease; + pDdiTable->pfnRetain = urKernelRetain; + pDdiTable->pfnSetArgLocal = nullptr; + pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; + pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; + pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; + pDdiTable->pfnSetArgValue = urKernelSetArgValue; + pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; + pDdiTable->pfnSetSpecializationConstants = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( + ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urSamplerCreate; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = urSamplerGetInfo; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = urSamplerRelease; + pDdiTable->pfnRetain = urSamplerRetain; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBufferCreate = urMemBufferCreate; + pDdiTable->pfnBufferPartition = urMemBufferPartition; + pDdiTable->pfnBufferCreateWithNativeHandle = + urMemBufferCreateWithNativeHandle; + pDdiTable->pfnImageCreateWithNativeHandle = urMemImageCreateWithNativeHandle; + pDdiTable->pfnGetInfo = urMemGetInfo; + pDdiTable->pfnGetNativeHandle = urMemGetNativeHandle; + pDdiTable->pfnImageCreate = urMemImageCreate; + pDdiTable->pfnImageGetInfo = urMemImageGetInfo; + pDdiTable->pfnRelease = urMemRelease; + pDdiTable->pfnRetain = urMemRetain; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( + ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceGlobalVariableRead = urEnqueueDeviceGlobalVariableRead; + pDdiTable->pfnDeviceGlobalVariableWrite = urEnqueueDeviceGlobalVariableWrite; + pDdiTable->pfnEventsWait = urEnqueueEventsWait; + pDdiTable->pfnEventsWaitWithBarrier = urEnqueueEventsWaitWithBarrier; + pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; + pDdiTable->pfnMemBufferCopy = urEnqueueMemBufferCopy; + pDdiTable->pfnMemBufferCopyRect = urEnqueueMemBufferCopyRect; + pDdiTable->pfnMemBufferFill = urEnqueueMemBufferFill; + pDdiTable->pfnMemBufferMap = urEnqueueMemBufferMap; + pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; + pDdiTable->pfnMemBufferReadRect = urEnqueueMemBufferReadRect; + pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; + pDdiTable->pfnMemBufferWriteRect = urEnqueueMemBufferWriteRect; + pDdiTable->pfnMemImageCopy = urEnqueueMemImageCopy; + pDdiTable->pfnMemImageRead = urEnqueueMemImageRead; + pDdiTable->pfnMemImageWrite = urEnqueueMemImageWrite; + pDdiTable->pfnMemUnmap = urEnqueueMemUnmap; + pDdiTable->pfnUSMFill2D = urEnqueueUSMFill2D; + pDdiTable->pfnUSMFill = urEnqueueUSMFill; + pDdiTable->pfnUSMAdvise = urEnqueueUSMAdvise; + pDdiTable->pfnUSMMemcpy2D = urEnqueueUSMMemcpy2D; + pDdiTable->pfnUSMMemcpy = urEnqueueUSMMemcpy; + pDdiTable->pfnUSMPrefetch = urEnqueueUSMPrefetch; + pDdiTable->pfnReadHostPipe = urEnqueueReadHostPipe; + pDdiTable->pfnWriteHostPipe = 
urEnqueueWriteHostPipe; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ur_global_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnGetLastResult = urGetLastResult; + pDdiTable->pfnInit = urInit; + pDdiTable->pfnTearDown = urTearDown; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( + ur_api_version_t version, ur_queue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urQueueCreate; + pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; + pDdiTable->pfnFinish = urQueueFinish; + pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnGetInfo = urQueueGetInfo; + pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; + pDdiTable->pfnRelease = urQueueRelease; + pDdiTable->pfnRetain = urQueueRetain; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; + pDdiTable->pfnFree = urUSMFree; + pDdiTable->pfnGetMemAllocInfo = urUSMGetMemAllocInfo; + pDdiTable->pfnHostAlloc = urUSMHostAlloc; + pDdiTable->pfnPoolCreate = nullptr; + pDdiTable->pfnPoolRetain = nullptr; + pDdiTable->pfnPoolRelease = nullptr; + pDdiTable->pfnPoolGetInfo = nullptr; + pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( + ur_api_version_t version, ur_device_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = urDeviceCreateWithNativeHandle; + pDdiTable->pfnGet = urDeviceGet; + pDdiTable->pfnGetGlobalTimestamps = urDeviceGetGlobalTimestamps; + pDdiTable->pfnGetInfo = urDeviceGetInfo; + pDdiTable->pfnGetNativeHandle = urDeviceGetNativeHandle; + pDdiTable->pfnPartition = urDevicePartition; + pDdiTable->pfnRelease = urDeviceRelease; + pDdiTable->pfnRetain = urDeviceRetain; + pDdiTable->pfnSelectBinary = urDeviceSelectBinary; + return UR_RESULT_SUCCESS; +} + +#if defined(__cplusplus) +} // extern "C" +#endif diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp new file mode 100644 index 0000000000000..7584e79a7c774 --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp @@ -0,0 +1,256 @@ +//===--------- usm.cpp - CUDA Adapter ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include + +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "event.hpp" +#include "platform.hpp" +#include "queue.hpp" + +#include + +/// USM: Implements USM Host allocations using CUDA Pinned Memory +/// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#page-locked-host-memory +UR_APIEXPORT ur_result_t UR_APICALL +urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + ur_usm_pool_handle_t pool, size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + + size_t DeviceMaxMemAllocSize = 0; + UR_ASSERT(urDeviceGetInfo(hContext->getDevice(), + UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, sizeof(size_t), + static_cast(&DeviceMaxMemAllocSize), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR(cuMemAllocHost(ppMem, size)); + } catch (ur_result_t Err) { + Result = Err; + } + + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + assert(Result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return Result; +} + +/// USM: Implements USM device allocations using a normal CUDA device pointer +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + size_t DeviceMaxMemAllocSize = 0; + UR_ASSERT(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(size_t), + static_cast(&DeviceMaxMemAllocSize), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ppMem, size)); + } catch (ur_result_t Err) { + return Err; + } + + assert(Result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return Result; +} + +/// USM: Implements USM Shared allocations using CUDA Managed Memory +/// +UR_APIEXPORT ur_result_t UR_APICALL +urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { + UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(!pUSMDesc || (pUSMDesc->align == 0 || + ((pUSMDesc->align & (pUSMDesc->align - 1)) == 0)), + UR_RESULT_ERROR_INVALID_VALUE); + + size_t DeviceMaxMemAllocSize = 0; + UR_ASSERT(urDeviceGetInfo(hDevice, 
UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, + sizeof(size_t), + static_cast(&DeviceMaxMemAllocSize), + nullptr) == UR_RESULT_SUCCESS, + UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(size > 0 && size <= DeviceMaxMemAllocSize, + UR_RESULT_ERROR_INVALID_USM_SIZE); + + ur_result_t Result = UR_RESULT_SUCCESS; + try { + ScopedContext Active(hContext); + Result = UR_CHECK_ERROR( + cuMemAllocManaged((CUdeviceptr *)ppMem, size, CU_MEM_ATTACH_GLOBAL)); + } catch (ur_result_t Err) { + return Err; + } + + assert(Result == UR_RESULT_SUCCESS && + (!pUSMDesc || pUSMDesc->align == 0 || + reinterpret_cast(*ppMem) % pUSMDesc->align == 0)); + + return Result; +} + +/// USM: Frees the given USM pointer associated with the context. +/// +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, + void *pMem) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + ur_result_t Result = UR_RESULT_SUCCESS; + try { + ScopedContext Active(hContext); + bool IsManaged; + unsigned int Type; + void *AttributeValues[2] = {&IsManaged, &Type}; + CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, + CU_POINTER_ATTRIBUTE_MEMORY_TYPE}; + Result = UR_CHECK_ERROR(cuPointerGetAttributes( + 2, Attributes, AttributeValues, (CUdeviceptr)pMem)); + UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + if (IsManaged || Type == CU_MEMORYTYPE_DEVICE) { + // Memory allocated with cuMemAlloc and cuMemAllocManaged must be freed + // with cuMemFree + Result = UR_CHECK_ERROR(cuMemFree((CUdeviceptr)pMem)); + } else { + // Memory allocated with cuMemAllocHost must be freed with cuMemFreeHost + Result = UR_CHECK_ERROR(cuMemFreeHost(pMem)); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, + ur_usm_alloc_info_t propName, size_t propValueSize, + void *pPropValue, size_t *pPropValueSizeRet) { + UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(pMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_result_t Result = UR_RESULT_SUCCESS; + + UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); + + try { + ScopedContext Active(hContext); + switch (propName) { + case UR_USM_ALLOC_INFO_TYPE: { + unsigned int Value; + // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE + CUresult Ret = cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)pMem); + if (Ret == CUDA_ERROR_INVALID_VALUE) { + // pointer not known to the CUDA subsystem + return ReturnValue(UR_USM_TYPE_UNKNOWN); + } + Result = checkErrorUR(Ret, __func__, __LINE__ - 5, __FILE__); + if (Value) { + // pointer to managed memory + return ReturnValue(UR_USM_TYPE_SHARED); + } + Result = UR_CHECK_ERROR(cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)pMem)); + UR_ASSERT(Value == CU_MEMORYTYPE_DEVICE || Value == CU_MEMORYTYPE_HOST, + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + if (Value == CU_MEMORYTYPE_DEVICE) { + // pointer to device memory + return ReturnValue(UR_USM_TYPE_DEVICE); + } + if (Value == CU_MEMORYTYPE_HOST) { + // pointer to host memory + return ReturnValue(UR_USM_TYPE_HOST); + } + // should never get here +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + } + case UR_USM_ALLOC_INFO_BASE_PTR: { +#if __CUDA_API_VERSION >= 10020 + // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was 
introduced in CUDA 10.2 + unsigned int Value; + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)pMem)); + return ReturnValue(Value); +#else + return UR_RESULT_ERROR_INVALID_VALUE; +#endif + } + case UR_USM_ALLOC_INFO_SIZE: { +#if __CUDA_API_VERSION >= 10020 + // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2 + unsigned int Value; + result = UR_CHECK_ERROR(cuPointerGetAttribute( + &Value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); + return ReturnValue(Value); +#else + return UR_RESULT_ERROR_INVALID_VALUE; +#endif + } + case UR_USM_ALLOC_INFO_DEVICE: { + // get device index associated with this pointer + unsigned int DeviceIndex; + Result = UR_CHECK_ERROR(cuPointerGetAttribute( + &DeviceIndex, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, + (CUdeviceptr)pMem)); + + // currently each device is in its own platform, so find the platform at + // the same index + std::vector Platforms; + Platforms.resize(DeviceIndex + 1); + Result = urPlatformGet(DeviceIndex + 1, Platforms.data(), nullptr); + + // get the device from the platform + ur_device_handle_t Device = Platforms[DeviceIndex]->Devices[0].get(); + return ReturnValue(Device); + } + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/sycl/plugins/unified_runtime/ur/ur.hpp b/sycl/plugins/unified_runtime/ur/ur.hpp index d0d1fb8f46912..2099b31529176 100644 --- a/sycl/plugins/unified_runtime/ur/ur.hpp +++ b/sycl/plugins/unified_runtime/ur/ur.hpp @@ -39,6 +39,9 @@ template <> uint32_t inline ur_cast(uint64_t Value) { const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = (ur_device_info_t)0x103D; +const ur_device_info_t UR_EXT_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = + (ur_device_info_t)((uint32_t)UR_DEVICE_INFO_FORCE_UINT32 - 1); + const ur_command_t UR_EXT_COMMAND_TYPE_USER = (ur_command_t)((uint32_t)UR_COMMAND_FORCE_UINT32 - 1); @@ -197,15 +200,19 @@ extern bool PiPlatformCachePopulated; // The getInfo*/ReturnHelper facilities provide shortcut way of // writing return bytes for the various getInfo APIs. 
+namespace ur { template ur_result_t getInfoImpl(size_t param_value_size, void *param_value, size_t *param_value_size_ret, T value, size_t value_size, Assign &&assign_func) { + if (!param_value && !param_value_size_ret) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } if (param_value != nullptr) { if (param_value_size < value_size) { - return UR_RESULT_ERROR_INVALID_VALUE; + return UR_RESULT_ERROR_INVALID_SIZE; } assign_func(param_value, value, value_size); @@ -260,6 +267,7 @@ getInfo(size_t param_value_size, void *param_value, return getInfoArray(strlen(value) + 1, param_value_size, param_value, param_value_size_ret, value); } +} // namespace ur class UrReturnHelper { public: @@ -276,20 +284,20 @@ class UrReturnHelper { // Scalar return value template ur_result_t operator()(const T &t) { - return getInfo(param_value_size, param_value, param_value_size_ret, t); + return ur::getInfo(param_value_size, param_value, param_value_size_ret, t); } // Array return value template ur_result_t operator()(const T *t, size_t s) { - return getInfoArray(s, param_value_size, param_value, param_value_size_ret, - t); + return ur::getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); } // Array return value where element type is differrent from T template ur_result_t operator()(const T *t, size_t s) { - return getInfoArray(s, param_value_size, param_value, - param_value_size_ret, t); + return ur::getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); } protected: diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp index 4fe263431aed2..4c3c4f5f8ecb7 100644 --- a/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp +++ b/sycl/test-e2e/KernelFusion/sync_two_queues_event_dep.cpp @@ -1,5 +1,6 @@ // For this test, complete_fusion must be supported. // REQUIRES: fusion +// UNSUPPORTED: cuda // RUN: %{build} -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s diff --git a/sycl/unittests/pi/cuda/CMakeLists.txt b/sycl/unittests/pi/cuda/CMakeLists.txt index 94ac39f07e474..7808340cc4302 100644 --- a/sycl/unittests/pi/cuda/CMakeLists.txt +++ b/sycl/unittests/pi/cuda/CMakeLists.txt @@ -22,9 +22,11 @@ target_include_directories(PiCudaTests "${sycl_inc_dir}/sycl/detail/" "${sycl_inc_dir}" "${sycl_plugin_dir}/cuda/" + "${sycl_plugin_dir}/unified_runtime/" ) target_link_libraries(PiCudaTests PRIVATE cudadrv + UnifiedRuntime-Headers ) From f749876e1c169a097de0c513acd1dceefebe037c Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Wed, 14 Jun 2023 11:53:17 +0100 Subject: [PATCH 17/55] [SYCL][CODEOWNERS] Make intel/llvm-reviewers-runtime design docs codeowners (#9851) This commit changes the code-owners of the sycl/docs/design section from @intel/dpcpp-specification-reviewers to intel/llvm-reviewers-runtime. With this, the intel/llvm-reviewers-runtime would be responsible for either reviewing the design changes or assign the appropriate teams to make a design review. 
Signed-off-by: Larsen, Steffen --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5ed2853134ba4..f40685f18f7f7 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -30,7 +30,7 @@ sycl/ @intel/llvm-reviewers-runtime # Documentation sycl/ReleaseNotes.md @intel/dpcpp-doc-reviewers @tfzhu sycl/doc/ @intel/dpcpp-doc-reviewers -sycl/doc/design/ @intel/dpcpp-specification-reviewers +sycl/doc/design/ @intel/llvm-reviewers-runtime sycl/doc/design/spirv-extensions/ @intel/dpcpp-spirv-doc-reviewers sycl/doc/extensions/ @intel/dpcpp-specification-reviewers From de5b479a2de9395c6243066a8dcf3472537c77dd Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 14 Jun 2023 06:57:07 -0400 Subject: [PATCH 18/55] [SYCL][ESIMD] Allow implicit conversion from std::experimental::simd_mask to ESIMD::simd_mask (#9830) Users of `invoke_simd` need to use `std::experimental::simd_mask` for masks as per the spec, but once they enter ESIMD code they will likely want to use the ESIMD classes. Provide an implicit conversion from `std::experimental::simd_mask` to `esimd::simd_mask` Without this change, you need to use a manual loop, as all you can do is access `std::experimental::simd_mask` element-by-element. Signed-off-by: Sarnie, Nick --- sycl/include/std/experimental/simd.hpp | 9 +++- .../ext/intel/esimd/detail/simd_mask_impl.hpp | 6 +++ .../experimental/detail/invoke_simd_types.hpp | 46 +++++++++++++++++++ .../ext/oneapi/experimental/invoke_simd.hpp | 25 +--------- .../InvokeSimd/Spec/simd_mask_merge.cpp | 4 +- 5 files changed, 62 insertions(+), 28 deletions(-) create mode 100644 sycl/include/sycl/ext/oneapi/experimental/detail/invoke_simd_types.hpp diff --git a/sycl/include/std/experimental/simd.hpp b/sycl/include/std/experimental/simd.hpp index 68f070724ad05..87c92a70248c8 100644 --- a/sycl/include/std/experimental/simd.hpp +++ b/sycl/include/std/experimental/simd.hpp @@ -854,6 +854,9 @@ class __simd_storage<_Tp, __simd_abi<_StorageKind::_VecExt, __num_element>> { void __set(size_t __index, _Tp __val) noexcept { __storage_[__index] = __val; } +#ifdef ENABLE_SYCL_EXT_ONEAPI_INVOKE_SIMD + const _StorageType& data() const noexcept { return __storage_; } +#endif }; #endif // _LIBCPP_HAS_NO_VECTOR_EXTENSION @@ -1666,6 +1669,10 @@ class simd_mask { static constexpr size_t size() noexcept; #endif // ENABLE_SYCL_EXT_ONEAPI_INVOKE_SIMD +#ifdef ENABLE_SYCL_EXT_ONEAPI_INVOKE_SIMD + const auto& data() const noexcept { return __s_.data(); } +#endif + simd_mask() = default; // broadcast constructor @@ -1756,4 +1763,4 @@ _LIBCPP_POP_MACROS // Removed for ENABLE_SYCL_EXT_ONEAPI_INVOKE_SIMD { //#endif /* _LIBCPP_EXPERIMENTAL_SIMD */ -// } Removed for ENABLE_SYCL_EXT_ONEAPI_INVOKE_SIMD \ No newline at end of file +// } Removed for ENABLE_SYCL_EXT_ONEAPI_INVOKE_SIMD diff --git a/sycl/include/sycl/ext/intel/esimd/detail/simd_mask_impl.hpp b/sycl/include/sycl/ext/intel/esimd/detail/simd_mask_impl.hpp index 46e5b4507d860..09cb4465b4be0 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/simd_mask_impl.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/simd_mask_impl.hpp @@ -12,6 +12,7 @@ #include #include +#include namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { @@ -95,6 +96,11 @@ class simd_mask_impl /// Implicit conversion from simd. 
simd_mask_impl(const simd &Val) : base_type(Val.data()) {} + /// Implicit conversion from std::experimental::simd_mask + template + simd_mask_impl(const ext::oneapi::experimental::simd_mask &Val) + : base_type(convert_vector(Val.data())) {} + private: /// @cond ESIMD_DETAIL static inline constexpr bool mask_size_ok_for_mem_io() { diff --git a/sycl/include/sycl/ext/oneapi/experimental/detail/invoke_simd_types.hpp b/sycl/include/sycl/ext/oneapi/experimental/detail/invoke_simd_types.hpp new file mode 100644 index 0000000000000..9921046c1978b --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/detail/invoke_simd_types.hpp @@ -0,0 +1,46 @@ +//==- invoke_simd_types.hpp - SYCL invoke_simd extension types --*- C++ -*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// ===--------------------------------------------------------------------=== // +// Part of the implemenation of the sycl_ext_oneapi_invoke_simd extension. +// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/proposed/sycl_ext_oneapi_invoke_simd.asciidoc +// ===--------------------------------------------------------------------=== // + +#pragma once + +// SYCL extension macro definition as required by the SYCL specification. +// 1 - Initial extension version. Base features are supported. +#define SYCL_EXT_ONEAPI_INVOKE_SIMD 1 + +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { + +namespace ext::oneapi::experimental { + +// --- Basic definitions prescribed by the spec. +namespace simd_abi { +// "Fixed-size simd width of N" ABI based on clang vectors - used as the ABI for +// SIMD objects this implementation of invoke_simd spec is based on. +template +using native_fixed_size = typename std::experimental::__simd_abi< + std::experimental::_StorageKind::_VecExt, N>; +} // namespace simd_abi + +// The SIMD object type, which is the generic std::experimental::simd type with +// the native fixed size ABI. +template +using simd = std::experimental::simd>; + +// The SIMD mask object type. +template +using simd_mask = + std::experimental::simd_mask>; +} // namespace ext::oneapi::experimental +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp index be24c8c99e581..6c3d1bc0374d6 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/invoke_simd.hpp @@ -11,13 +11,9 @@ #pragma once -// SYCL extension macro definition as required by the SYCL specification. -// 1 - Initial extension version. Base features are supported. -#define SYCL_EXT_ONEAPI_INVOKE_SIMD 1 - +#include #include -#include #include #include @@ -72,25 +68,6 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace ext::oneapi::experimental { -// --- Basic definitions prescribed by the spec. -namespace simd_abi { -// "Fixed-size simd width of N" ABI based on clang vectors - used as the ABI for -// SIMD objects this implementation of invoke_simd spec is based on. -template -using native_fixed_size = typename std::experimental::__simd_abi< - std::experimental::_StorageKind::_VecExt, N>; -} // namespace simd_abi - -// The SIMD object type, which is the generic std::experimental::simd type with -// the native fixed size ABI. 
-template -using simd = std::experimental::simd>; - -// The SIMD mask object type. -template -using simd_mask = - std::experimental::simd_mask>; - // --- Helpers namespace detail { diff --git a/sycl/test-e2e/InvokeSimd/Spec/simd_mask_merge.cpp b/sycl/test-e2e/InvokeSimd/Spec/simd_mask_merge.cpp index 9158bb504a7ae..398d7641863fd 100644 --- a/sycl/test-e2e/InvokeSimd/Spec/simd_mask_merge.cpp +++ b/sycl/test-e2e/InvokeSimd/Spec/simd_mask_merge.cpp @@ -17,9 +17,7 @@ constexpr int VL = 16; [[intel::device_indirectly_callable]] simd SIMD_CALLEE(simd va, simd_mask mask) SYCL_ESIMD_FUNCTION { esimd::simd ret(0); - esimd::simd_mask emask; - for(int i = 0; i < VL; i++) - emask[i] = static_cast(mask[i]); + esimd::simd_mask emask = mask; ret.merge(va, !emask); return ret; } From 447f5980fe21470d8314ff1d85c052916b4228ab Mon Sep 17 00:00:00 2001 From: Byoungro So Date: Wed, 14 Jun 2023 04:02:47 -0700 Subject: [PATCH 19/55] [SYCL] Disable ZE_DEBUG test for interop (#9857) This test uses an interop API to create a kernel. So, ZE_DEBUG should be disabled. Signed-off-by: Byoungro So --- sycl/test-e2e/KernelAndProgram/spec_constants_after_link.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/KernelAndProgram/spec_constants_after_link.cpp b/sycl/test-e2e/KernelAndProgram/spec_constants_after_link.cpp index 06c7947d9f9ea..f27a9d997ef72 100644 --- a/sycl/test-e2e/KernelAndProgram/spec_constants_after_link.cpp +++ b/sycl/test-e2e/KernelAndProgram/spec_constants_after_link.cpp @@ -7,8 +7,8 @@ // HIP backend does not currently implement linking. // UNSUPPORTED: hip -// Windows doesn't yet have full shutdown(). -// UNSUPPORTED: ze_debug && windows +// This test uses interop that has the ownership on a kernel. +// UNSUPPORTED: ze_debug // This test checks that specialization constant information is available on // kernel bundles produced by sycl::link. From 054ed1c2a21f4b6670612dcd47a96893919b11e9 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Wed, 14 Jun 2023 13:48:05 +0100 Subject: [PATCH 20/55] [SYCL][CUDA] Fix post merge errors from #9512 (#9872) Resolves the warnings as errors reported in [post merge](https://github.com/intel/llvm/actions/runs/5266121277/jobs/9519634360) as a result of merging #9512. Additionally move pre-processor guards to resolve unused global variables which would also fail in this build configuration (clang & SYCL_ENABLE_WERROR=ON). 
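For reference, a minimal sketch of the two idioms applied here (illustrative only, with simplified types and a hypothetical function name; it is not part of the diff below):

  #ifdef XPTI_ENABLE_INSTRUMENTATION
  // The global is defined only when the code that uses it is also compiled in,
  // so builds without XPTI do not trip unused-variable warnings under -Werror.
  static void *GCallEvent = nullptr;
  #endif // XPTI_ENABLE_INSTRUMENTATION

  // [[maybe_unused]] keeps -Wunused-parameter quiet for a parameter that is
  // accepted for interface compatibility but not consumed yet.
  int allocExample([[maybe_unused]] int pool) { return 0; }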
--- .../ur/adapters/cuda/tracing.cpp | 4 +--- .../unified_runtime/ur/adapters/cuda/usm.cpp | 22 +++++++++---------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp index bf1f2b892de6a..9c0183960eebb 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/tracing.cpp @@ -19,22 +19,20 @@ #include #include +#ifdef XPTI_ENABLE_INSTRUMENTATION constexpr auto CUDA_CALL_STREAM_NAME = "sycl.experimental.cuda.call"; constexpr auto CUDA_DEBUG_STREAM_NAME = "sycl.experimental.cuda.debug"; thread_local uint64_t CallCorrelationID = 0; thread_local uint64_t DebugCorrelationID = 0; -#ifdef XPTI_ENABLE_INSTRUMENTATION static xpti_td *GCallEvent = nullptr; static xpti_td *GDebugEvent = nullptr; -#endif // XPTI_ENABLE_INSTRUMENTATION constexpr auto GVerStr = "0.1"; constexpr int GMajVer = 0; constexpr int GMinVer = 1; -#ifdef XPTI_ENABLE_INSTRUMENTATION static void cuptiCallback(void *, CUpti_CallbackDomain, CUpti_CallbackId CBID, const void *CBData) { if (xptiTraceEnabled()) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp index 7584e79a7c774..06b955968f19a 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm.cpp @@ -19,9 +19,9 @@ /// USM: Implements USM Host allocations using CUDA Pinned Memory /// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#page-locked-host-memory -UR_APIEXPORT ur_result_t UR_APICALL -urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, - ur_usm_pool_handle_t pool, size_t size, void **ppMem) { +UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( + ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, + [[maybe_unused]] ur_usm_pool_handle_t pool, size_t size, void **ppMem) { UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -55,10 +55,10 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, /// USM: Implements USM device allocations using a normal CUDA device pointer /// -UR_APIEXPORT ur_result_t UR_APICALL -urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, - size_t size, void **ppMem) { +UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); @@ -92,10 +92,10 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, /// USM: Implements USM Shared allocations using CUDA Managed Memory /// -UR_APIEXPORT ur_result_t UR_APICALL -urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, - size_t size, void **ppMem) { +UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( + ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_usm_desc_t *pUSMDesc, [[maybe_unused]] ur_usm_pool_handle_t pool, + size_t size, void **ppMem) { UR_ASSERT(ppMem, UR_RESULT_ERROR_INVALID_NULL_POINTER); 
UR_ASSERT(hContext, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); From 405778ab831b86cc536f0ef7730545b1a73920f1 Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Wed, 14 Jun 2023 09:11:45 -0400 Subject: [PATCH 21/55] [SYCL][FE][Driver] Implement floating point accuracy control (#8280) This patch implements the accuracy controls for floating-point math functions in DPC++. Using the -ffp-accuracy command line option, the user can request an accuracy level for all math functions or for specific ones. Calls to fpbuiltin intrinsics llvm.fpbuilin.* are then generated. Syntax: Linux: -ffp-accuracy=[default|value][:funclist] Windows: /Qfp-accuracy:[default|value][:funclist] funclist is an optional comma separated list of math library functions. -ffp-accuracy=[default|value] default: Use the implementation defined accuracy for all math library functions. This is equivalent to not using this option. value: Use the defined standard accuracy for what each accuracy value means for all math library functions. -ffp-accuracy=[default|value][:funclist] default: Use the implementation defined accuracy for the math library functions in funclist. This is equivalent to not using this option. value: Use the defined standard accuracy for what each accuracy value means for the math library functions in funclist. value is one of the following values denoting the library function accuracy. high This is equivalent to max-error = 1.0. medium This is equivalent to max-error = 4. low This is equivalent to accuracy-bits = 11 for single-precision functions. accuracy-bits = 26 for double-precision functions. sycl Determined by the OpenCL specification for math function accuracy: https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_C.html#relative-error-as-ulps cuda Determined by standard https://docs.nvidia.com/cuda/cuda-c-programming-guide/#mathematical-functions-appendix --- clang/include/clang/Basic/CodeGenOptions.def | 4 + .../clang/Basic/DiagnosticCommonKinds.td | 3 + .../clang/Basic/DiagnosticDriverKinds.td | 9 +- clang/include/clang/Basic/FPOptions.def | 3 +- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Basic/LangOptions.h | 13 + clang/include/clang/Driver/Options.td | 15 + .../clang/Frontend/CompilerInvocation.h | 3 + clang/lib/CodeGen/CGBuiltin.cpp | 246 +++++++++-- clang/lib/CodeGen/CGCall.cpp | 46 +++ clang/lib/CodeGen/CodeGenFunction.h | 27 ++ clang/lib/CodeGen/CodeGenModule.cpp | 10 + clang/lib/CodeGen/CodeGenModule.h | 9 + clang/lib/Driver/ToolChains/Clang.cpp | 34 +- clang/lib/Frontend/CompilerInvocation.cpp | 75 ++++ clang/test/CodeGen/fp-accuracy.c | 389 ++++++++++++++++++ clang/test/Driver/fp-accuracy.c | 64 +++ 17 files changed, 907 insertions(+), 44 deletions(-) create mode 100644 clang/test/CodeGen/fp-accuracy.c create mode 100644 clang/test/Driver/fp-accuracy.c diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 051fd28c57759..e8ce113b0a97b 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -373,6 +373,10 @@ CODEGENOPT(VirtualFunctionElimination, 1, 0) ///< Whether to apply the dead /// virtual function elimination /// optimization. +/// Whether accuracy levels for math library functions are requested by the +/// user. These accuracy levels will then be expressed in terms of ULPs. 
+CODEGENOPT(FPAccuracy, 1, 0) + /// Whether to use public LTO visibility for entities in std and stdext /// namespaces. This is enabled by clang-cl's /MT and /MTd flags. CODEGENOPT(LTOVisibilityPublicStd, 1, 0) diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index eb1649cc238a5..1e2d7a1c83b33 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -301,6 +301,9 @@ def warn_stack_clash_protection_inline_asm : Warning< def warn_slh_does_not_support_asm_goto : Warning< "speculative load hardening does not protect functions with asm goto">, InGroup>; + +def err_drv_incompatible_options : Error< + "the combination of '%0' and '%1' is incompatible">; } // Sema && Serialization diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index e2c54549d495d..8f65523effeb5 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -63,6 +63,10 @@ def err_drv_no_cuda_libdevice : Error< "via '--cuda-path', or pass '-nocudalib' to build without linking with " "libdevice">; +def warn_function_fp_accuracy_already_set : Warning < + "floating point accuracy value of '%0' has already been assigned to " + "function '%1'">, + InGroup>; def err_drv_no_rocm_device_lib : Error< "cannot find ROCm device library%select{| for %1|for ABI version %1}0; provide its path via " "'--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build " @@ -141,8 +145,9 @@ def err_drv_invalid_unwindlib_name : Error< "invalid unwind library name in argument '%0'">; def err_drv_incompatible_unwindlib : Error< "--rtlib=libgcc requires --unwindlib=libgcc">; -def err_drv_incompatible_options : Error< - "the combination of '%0' and '%1' is incompatible">; +def err_drv_incompatible_fp_accuracy_options : Error< + "floating point accuracy requirements cannot be guaranteed when '-fmath-errno' " + "is enabled; use '-fno-math-errno' to enable floating point accuracy control">; def err_drv_invalid_stdlib_name : Error< "invalid library name in argument '%0'">; def err_drv_invalid_output_with_multiple_archs : Error< diff --git a/clang/include/clang/Basic/FPOptions.def b/clang/include/clang/Basic/FPOptions.def index 4517be6f178d2..29cf787b4f876 100644 --- a/clang/include/clang/Basic/FPOptions.def +++ b/clang/include/clang/Basic/FPOptions.def @@ -26,5 +26,6 @@ OPTION(AllowReciprocal, bool, 1, NoSignedZero) OPTION(AllowApproxFunc, bool, 1, AllowReciprocal) OPTION(FPEvalMethod, LangOptions::FPEvalMethodKind, 2, AllowApproxFunc) OPTION(Float16ExcessPrecision, LangOptions::ExcessPrecisionKind, 2, FPEvalMethod) -OPTION(BFloat16ExcessPrecision, LangOptions::ExcessPrecisionKind, 2, FPEvalMethod) +OPTION(BFloat16ExcessPrecision, LangOptions::ExcessPrecisionKind, 2, Float16ExcessPrecision) +OPTION(FPAccuracy, LangOptions::FPAccuracyKind, 3, BFloat16ExcessPrecision) #undef OPTION diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 6bdffb6abc32f..8d3b9a64eb126 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -338,6 +338,7 @@ BENIGN_ENUM_LANGOPT(FPExceptionMode, FPExceptionModeKind, 2, FPE_Default, "FP Ex BENIGN_ENUM_LANGOPT(FPEvalMethod, FPEvalMethodKind, 2, FEM_UnsetOnCommandLine, "FP type used for floating point arithmetic") ENUM_LANGOPT(Float16ExcessPrecision, ExcessPrecisionKind, 2, 
FPP_Standard, "Intermediate truncation behavior for Float16 arithmetic") ENUM_LANGOPT(BFloat16ExcessPrecision, ExcessPrecisionKind, 2, FPP_Standard, "Intermediate truncation behavior for BFloat16 arithmetic") +BENIGN_ENUM_LANGOPT(FPAccuracy, FPAccuracyKind, 3, FPA_Default, "Accuracy for floating point operations and library functions") LANGOPT(NoBitFieldTypeAlign , 1, 0, "bit-field type alignment") LANGOPT(HexagonQdsp6Compat , 1, 0, "hexagon-qdsp6 backward compatibility") LANGOPT(ObjCAutoRefCount , 1, 0, "Objective-C automated reference counting") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index e72dee31f7a0d..04ec54cc43973 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -303,6 +303,15 @@ class LangOptions : public LangOptionsBase { enum ExcessPrecisionKind { FPP_Standard, FPP_Fast, FPP_None }; + enum FPAccuracyKind { + FPA_Default, + FPA_High, + FPA_Medium, + FPA_Low, + FPA_Sycl, + FPA_Cuda, + }; + /// Possible exception handling behavior. enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm }; @@ -509,6 +518,10 @@ class LangOptions : public LangOptionsBase { /// records. std::string OptRecordFile; + std::string FPAccuracyVal; + using FPAccuracyFuncMapTy = std::map; + FPAccuracyFuncMapTy FPAccuracyFuncMap; + LangOptions(); /// Set language defaults for the given input language and diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2b4546ec3b911..4eb31fc50e656 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1729,6 +1729,13 @@ def ffp_exception_behavior_EQ : Joined<["-"], "ffp-exception-behavior=">, Group< Values<"ignore,maytrap,strict">, NormalizedValuesScope<"LangOptions">, NormalizedValues<["FPE_Ignore", "FPE_MayTrap", "FPE_Strict"]>, MarshallingInfoEnum, "FPE_Default">; +def ffp_accuracy_EQ : Joined<["-"], "ffp-accuracy=">, Group, Flags<[CC1Option]>, + HelpText<"Specifies the required accuracy for floating-point operations and library calls.">, + Values<"default,high,medium,low,sycl,cuda">, NormalizedValuesScope<"LangOptions">, + NormalizedValues<["FPA_Default", "FPA_High", "FPA_Medium", "FPA_Low", "FPA_Sycl", "FPA_Cuda"]>, + MarshallingInfoEnum, "FPA_Default">; +def ffp_builtin_accuracy_EQ : Joined<["-"], "ffp-builtin-accuracy=">, Group, Flags<[CC1Option]>; + defm fast_math : BoolFOption<"fast-math", LangOpts<"FastMath">, DefaultFalse, PosFlag : Option<["/", "-"], name, // (We don't put any of these in cl_compile_Group as the options they alias are // already in the right group.) 
+// INTEL_CUSTOMIZATION +def _SLASH_Qfp_accuracy_EQ : CLJoined<"Qfp-accuracy=">, + Alias; +def _SLASH_Qfp_accuracy_COL : CLJoined<"Qfp-accuracy:">, + Alias,HelpText<"Specifies the required accuracy for " + "floating-point operations and library calls.">; +// END INTEL_CUSTOMIZATION + def _SLASH_Brepro : CLFlag<"Brepro">, HelpText<"Do not write current time into COFF output (breaks link.exe /incremental)">, Alias; diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h index 1dbd1eda62b3f..ee147022e5c62 100644 --- a/clang/include/clang/Frontend/CompilerInvocation.h +++ b/clang/include/clang/Frontend/CompilerInvocation.h @@ -276,6 +276,9 @@ class CompilerInvocation : public CompilerInvocationRefBase, std::vector &Includes, DiagnosticsEngine &Diags); + static void ParseFpAccuracyArgs(LangOptions &Opts, llvm::opt::ArgList &Args, + DiagnosticsEngine &Diags); + /// Generate command line options from LangOptions. static void GenerateLangArgs(const LangOptions &Opts, SmallVectorImpl &Args, diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 64e0751ac4f1b..38163c7946488 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -35,6 +35,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/FPAccuracy.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" @@ -504,18 +505,69 @@ static Value *EmitISOVolatileStore(CodeGenFunction &CGF, const CallExpr *E) { return Store; } +static CallInst *CreateBuiltinCallWithAttr(CodeGenFunction &CGF, StringRef Name, + llvm::Function *FPBuiltinF, + ArrayRef Args, + unsigned ID) { + llvm::CallInst *CI = CGF.Builder.CreateCall(FPBuiltinF, Args); + // TODO: Replace AttrList with a single attribute. The call can only have a + // single FPAccuracy attribute. + llvm::AttributeList AttrList; + // sincos() doesn't return a value, but it still has a type associated with + // it that corresponds to the operand type. + CGF.CGM.getFPAccuracyFuncAttributes( + Name, AttrList, ID, + Name == "sincos" ? Args[0]->getType() : FPBuiltinF->getReturnType()); + CI->setAttributes(AttrList); + return CI; +} + +static Function *getIntrinsic(CodeGenFunction &CGF, llvm::Value *Src0, + unsigned FPIntrinsicID, unsigned IntrinsicID, + bool HasAccuracyRequirement) { + return HasAccuracyRequirement + ? CGF.CGM.getIntrinsic(FPIntrinsicID, Src0->getType()) + : CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); +} + +static bool hasAccuracyRequirement(CodeGenFunction &CGF, StringRef Name) { + if (!CGF.getLangOpts().FPAccuracyVal.empty()) + return true; + auto FuncMapIt = CGF.getLangOpts().FPAccuracyFuncMap.find(Name.str()); + return FuncMapIt != CGF.getLangOpts().FPAccuracyFuncMap.end(); +} + // Emit a simple mangled intrinsic that has 1 argument and a return type // matching the argument type. Depending on mode, this may be a constrained -// floating-point intrinsic. -static Value *emitUnaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, - const CallExpr *E, unsigned IntrinsicID, - unsigned ConstrainedIntrinsicID) { +// or an fpbuiltin floating-point intrinsic. 
+static Value *emitUnaryMaybeConstrainedFPBuiltin( + CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID, + unsigned ConstrainedIntrinsicID, + unsigned FPAccuracyIntrinsicID = Intrinsic::not_intrinsic) { llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); - + if (FPAccuracyIntrinsicID != Intrinsic::not_intrinsic) { + if (CGF.CGM.getCodeGenOpts().FPAccuracy) { + if (CGF.getLangOpts().MathErrno) { + DiagnosticsEngine &Diags = CGF.CGM.getDiags(); + Diags.Report(E->getBeginLoc(), diag::err_drv_incompatible_options) + << "-ffp-accuracy" + << "-fmath-errno"; + } else { + StringRef Name = + CGF.CGM.getContext().BuiltinInfo.getName(CGF.getCurrentBuiltinID()); + // Use fpbuiltin intrinsic only when needed. + Function *Func = + getIntrinsic(CGF, Src0, FPAccuracyIntrinsicID, IntrinsicID, + hasAccuracyRequirement(CGF, Name)); + return CreateBuiltinCallWithAttr(CGF, Name, Func, {Src0}, + FPAccuracyIntrinsicID); + } + } + } if (CGF.Builder.getIsFPConstrained()) { CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType()); - return CGF.Builder.CreateConstrainedFPCall(F, { Src0 }); + return CGF.Builder.CreateConstrainedFPCall(F, {Src0}); } else { Function *F = CGF.CGM.getIntrinsic(IntrinsicID, Src0->getType()); return CGF.Builder.CreateCall(F, Src0); @@ -524,12 +576,21 @@ static Value *emitUnaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, // Emit an intrinsic that has 2 operands of the same type as its result. // Depending on mode, this may be a constrained floating-point intrinsic. -static Value *emitBinaryMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, - const CallExpr *E, unsigned IntrinsicID, - unsigned ConstrainedIntrinsicID) { +static Value *emitBinaryMaybeConstrainedFPBuiltin( + CodeGenFunction &CGF, const CallExpr *E, unsigned IntrinsicID, + unsigned ConstrainedIntrinsicID, + unsigned FPAccuracyIntrinsicID = Intrinsic::not_intrinsic) { llvm::Value *Src0 = CGF.EmitScalarExpr(E->getArg(0)); llvm::Value *Src1 = CGF.EmitScalarExpr(E->getArg(1)); - + if (CGF.CGM.getCodeGenOpts().FPAccuracy) { + StringRef Name = + CGF.CGM.getContext().BuiltinInfo.getName(CGF.getCurrentBuiltinID()); + // Use fpbuiltin intrinsic only when needed. + Function *Func = getIntrinsic(CGF, Src0, FPAccuracyIntrinsicID, IntrinsicID, + hasAccuracyRequirement(CGF, Name)); + return CreateBuiltinCallWithAttr(CGF, Name, Func, {Src0, Src1}, + FPAccuracyIntrinsicID); + } if (CGF.Builder.getIsFPConstrained()) { CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, E); Function *F = CGF.CGM.getIntrinsic(ConstrainedIntrinsicID, Src0->getType()); @@ -2231,6 +2292,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Result.Val.getFloat())); } + CurrentBuiltinIDRAII CB(*this, BuiltinID); + // If current long-double semantics is IEEE 128-bit, replace math builtins // of long-double with f128 equivalent. 
// TODO: This mutation should also be applied to other targets other than PPC, @@ -2291,9 +2354,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_cosf16: case Builtin::BI__builtin_cosl: case Builtin::BI__builtin_cosf128: - return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, - Intrinsic::cos, - Intrinsic::experimental_constrained_cos)); + return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::cos, Intrinsic::experimental_constrained_cos, + Intrinsic::fpbuiltin_cos)); case Builtin::BIexp: case Builtin::BIexpf: @@ -2303,9 +2366,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_expf16: case Builtin::BI__builtin_expl: case Builtin::BI__builtin_expf128: - return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, - Intrinsic::exp, - Intrinsic::experimental_constrained_exp)); + return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::exp, Intrinsic::experimental_constrained_exp, + Intrinsic::fpbuiltin_exp)); case Builtin::BIexp2: case Builtin::BIexp2f: @@ -2315,9 +2378,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_exp2f16: case Builtin::BI__builtin_exp2l: case Builtin::BI__builtin_exp2f128: - return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, - Intrinsic::exp2, - Intrinsic::experimental_constrained_exp2)); + return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::exp2, Intrinsic::experimental_constrained_exp2, + Intrinsic::fpbuiltin_exp2)); case Builtin::BIfabs: case Builtin::BIfabsf: @@ -2401,9 +2464,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_logf16: case Builtin::BI__builtin_logl: case Builtin::BI__builtin_logf128: - return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, - Intrinsic::log, - Intrinsic::experimental_constrained_log)); + return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::log, Intrinsic::experimental_constrained_log, + Intrinsic::fpbuiltin_log)); case Builtin::BIlog10: case Builtin::BIlog10f: @@ -2413,9 +2476,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_log10f16: case Builtin::BI__builtin_log10l: case Builtin::BI__builtin_log10f128: - return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, - Intrinsic::log10, - Intrinsic::experimental_constrained_log10)); + return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::log10, Intrinsic::experimental_constrained_log10, + Intrinsic::fpbuiltin_log10)); case Builtin::BIlog2: case Builtin::BIlog2f: @@ -2425,9 +2488,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_log2f16: case Builtin::BI__builtin_log2l: case Builtin::BI__builtin_log2f128: - return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, - Intrinsic::log2, - Intrinsic::experimental_constrained_log2)); + return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::log2, Intrinsic::experimental_constrained_log2, + Intrinsic::fpbuiltin_log2)); case Builtin::BInearbyint: case Builtin::BInearbyintf: @@ -2448,9 +2511,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_powf16: case Builtin::BI__builtin_powl: case Builtin::BI__builtin_powf128: - return 
RValue::get(emitBinaryMaybeConstrainedFPBuiltin(*this, E, - Intrinsic::pow, - Intrinsic::experimental_constrained_pow)); + return RValue::get(emitBinaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::pow, Intrinsic::experimental_constrained_pow, + Intrinsic::fpbuiltin_pow)); case Builtin::BIrint: case Builtin::BIrintf: @@ -2496,9 +2559,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_sinf16: case Builtin::BI__builtin_sinl: case Builtin::BI__builtin_sinf128: - return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, - Intrinsic::sin, - Intrinsic::experimental_constrained_sin)); + return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::sin, Intrinsic::experimental_constrained_sin, + Intrinsic::fpbuiltin_sin)); case Builtin::BIsqrt: case Builtin::BIsqrtf: @@ -2508,9 +2571,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_sqrtf16: case Builtin::BI__builtin_sqrtl: case Builtin::BI__builtin_sqrtf128: - return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(*this, E, - Intrinsic::sqrt, - Intrinsic::experimental_constrained_sqrt)); + return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( + *this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt, + Intrinsic::fpbuiltin_sqrt)); case Builtin::BItrunc: case Builtin::BItruncf: @@ -22062,6 +22125,115 @@ RValue CodeGenFunction::EmitIntelFPGAMemBuiltin(const CallExpr *E) { return RValue::get(Ann); } +llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( + llvm::FunctionType *IRFuncTy, const SmallVectorImpl &IRArgs, + llvm::Value *FnPtr, const FunctionDecl *FD) { + llvm::Function *Func; + unsigned FPAccuracyIntrinsicID = 0; + StringRef Name; + if (CurrentBuiltinID == 0) { + // Even if the current function doesn't have a clang builtin, create + // an 'fpbuiltin-max-error' attribute for it; unless it's marked with + // an NoBuiltin attribute. + if (!FD->hasAttr()) { + Name = FD->getName(); + FPAccuracyIntrinsicID = + llvm::StringSwitch(Name) + .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd) + .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv) + .Case("fmul", llvm::Intrinsic::fpbuiltin_fmul) + .Case("fsub", llvm::Intrinsic::fpbuiltin_fsub) + .Case("frem", llvm::Intrinsic::fpbuiltin_frem) + .Case("sincos", llvm::Intrinsic::fpbuiltin_sincos) + .Case("exp10", llvm::Intrinsic::fpbuiltin_exp10) + .Case("rsqrt", llvm::Intrinsic::fpbuiltin_rsqrt); + } else { + return nullptr; + } + } else { + // The function has a clang builtin. Create an attribute for it + // only if it has an fpbuiltin intrinsic. + unsigned BuiltinID = getCurrentBuiltinID(); + Name = CGM.getContext().BuiltinInfo.getName(BuiltinID); + switch (BuiltinID) { + default: + // If the function has a clang builtin but doesn't have an + // fpbuiltin, it will be generated with no 'fpbuiltin-max-error' + // attribute. 
+ return nullptr; + case Builtin::BItan: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_tan; + break; + case Builtin::BItanh: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_tanh; + break; + case Builtin::BIlog2: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_log2; + break; + case Builtin::BIlog1p: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_log1p; + break; + case Builtin::BIcos: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_cos; + break; + case Builtin::BIcosh: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_cosh; + break; + case Builtin::BIacos: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_acos; + break; + case Builtin::BIacosh: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_acosh; + break; + case Builtin::BIsin: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_sin; + break; + case Builtin::BIsinh: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_sinh; + break; + case Builtin::BIasin: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_asin; + break; + case Builtin::BIasinh: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_asinh; + break; + case Builtin::BIatan: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_atan; + break; + case Builtin::BIatanh: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_atanh; + break; + case Builtin::BIatan2: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_atan2; + break; + case Builtin::BIerf: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_erf; + break; + case Builtin::BIerfc: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_erfc; + break; + case Builtin::BIexp: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_exp; + break; + case Builtin::BIexp2: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_exp2; + break; + case Builtin::BIexpm1: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_expm1; + break; + case Builtin::BIhypot: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_hypot; + break; + case Builtin::BIldexp: + FPAccuracyIntrinsicID = Intrinsic::fpbuiltin_ldexp; + break; + } + } + Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType()); + return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs), + FPAccuracyIntrinsicID); +} + Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue) { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 15f887081e85e..c0adfce62d233 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -34,6 +34,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/FPAccuracy.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" @@ -1836,6 +1837,44 @@ static bool HasStrictReturn(const CodeGenModule &Module, QualType RetTy, Module.getLangOpts().Sanitize.has(SanitizerKind::Return); } +static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) { + return llvm::StringSwitch(FPAccuracyStr) + .Case("high", llvm::fp::FPAccuracy::High) + .Case("medium", llvm::fp::FPAccuracy::Medium) + .Case("low", llvm::fp::FPAccuracy::Low) + .Case("sycl", llvm::fp::FPAccuracy::SYCL) + .Case("cuda", llvm::fp::FPAccuracy::CUDA); +} + +void CodeGenModule::getDefaultFunctionFPAccuracyAttributes( + StringRef Name, llvm::AttrBuilder &FuncAttrs, unsigned ID, + const llvm::Type *FuncType) { + // Priority is given to to the accuracy specific to the function. + // So, if the command line is something like this: + // 'clang -fp-accuracy = high -fp-accuracy = low:[sin]'. 
+ // This means, all library functions will have the accuracy 'high' + // except 'sin', which should have an accuracy value of 'low'. + // To ensure that, first check if Name has a required accuracy by visiting + // the 'FPAccuracyFuncMap'; if no accuracy is mapped to Name (FuncAttrs + // is empty), then set its accuracy from the TU's accuracy value. + if (!getLangOpts().FPAccuracyFuncMap.empty()) { + auto FuncMapIt = getLangOpts().FPAccuracyFuncMap.find(Name.str()); + if (FuncMapIt != getLangOpts().FPAccuracyFuncMap.end()) { + StringRef FPAccuracyVal = llvm::fp::getAccuracyForFPBuiltin( + ID, FuncType, convertFPAccuracy(FuncMapIt->second)); + assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected"); + FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal); + } + } + if (FuncAttrs.attrs().size() == 0) + if (!getLangOpts().FPAccuracyVal.empty()) { + StringRef FPAccuracyVal = llvm::fp::getAccuracyForFPBuiltin( + ID, FuncType, convertFPAccuracy(getLangOpts().FPAccuracyVal)); + assert(!FPAccuracyVal.empty() && "A valid accuracy value is expected"); + FuncAttrs.addAttribute("fpbuiltin-max-error=", FPAccuracyVal); + } +} + /// Add denormal-fp-math and denormal-fp-math-f32 as appropriate for the /// requested denormal behavior, accounting for the overriding behavior of the /// -f32 case. @@ -5581,6 +5620,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, // Emit the actual call/invoke instruction. llvm::CallBase *CI; if (!InvokeDest) { + if (CGM.getCodeGenOpts().FPAccuracy) { + const auto *FD = dyn_cast_if_present(TargetDecl); + assert(FD && "expecting a function"); + CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD); + if (CI) + return RValue::get(CI); + } CI = Builder.CreateCall(IRFuncTy, CalleePtr, IRCallArgs, BundleList); } else { llvm::BasicBlock *Cont = createBasicBlock("invoke.cont"); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 2184a8401b21c..58709193b374a 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -1597,6 +1597,28 @@ class CodeGenFunction : public CodeGenTypeCache { SourceLocation LastStopPoint; public: + /// Class to manage the BuiltinID for the current builtin expression during + /// processing in EmitBuiltinExpr. + class CurrentBuiltinIDRAII { + CodeGenFunction &CGF; + unsigned SavedBuiltinID; + + public: + CurrentBuiltinIDRAII(CodeGenFunction &CGF, unsigned BuiltinID) + : CGF(CGF), SavedBuiltinID(CGF.CurrentBuiltinID) { + CGF.CurrentBuiltinID = BuiltinID; + } + ~CurrentBuiltinIDRAII() { CGF.CurrentBuiltinID = SavedBuiltinID; } + }; + +private: + unsigned CurrentBuiltinID = /*NotBuiltin*/ 0; + +public: + unsigned getCurrentBuiltinID() const { + assert(CurrentBuiltinID != /*NotBuiltin*/ 0); + return CurrentBuiltinID; + } /// Source location information about the default argument or member /// initializer expression we're evaluating, if any. 
CurrentSourceLocExprScope CurSourceLocExprScope; @@ -4289,6 +4311,11 @@ class CodeGenFunction : public CodeGenTypeCache { ReturnValueSlot ReturnValue); RValue EmitIntelFPGAMemBuiltin(const CallExpr *E); + llvm::CallInst * + EmitFPBuiltinIndirectCall(llvm::FunctionType *IRFuncTy, + const SmallVectorImpl &IRArgs, + llvm::Value *FnPtr, const FunctionDecl *FD); + enum class MSVCIntrin; llvm::Value *EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID, const CallExpr *E); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 4751adeb529c4..6178ce2840f79 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -7881,3 +7881,13 @@ void CodeGenModule::moveLazyEmissionStates(CodeGenModule *NewBuilder) { NewBuilder->ABI->MangleCtx = std::move(ABI->MangleCtx); } + +void CodeGenModule::getFPAccuracyFuncAttributes(StringRef Name, + llvm::AttributeList &AttrList, + unsigned ID, + const llvm::Type *FuncType) { + llvm::AttrBuilder FuncAttrs(getLLVMContext()); + getDefaultFunctionFPAccuracyAttributes(Name, FuncAttrs, ID, FuncType); + AttrList = llvm::AttributeList::get( + getLLVMContext(), llvm::AttributeList::FunctionIndex, FuncAttrs); +} diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 2ccacf40ac56b..5d1521da2da63 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1593,6 +1593,10 @@ class CodeGenModule : public CodeGenTypeCache { /// because we'll lose all important information after each repl. void moveLazyEmissionStates(CodeGenModule *NewBuilder); + void getFPAccuracyFuncAttributes(StringRef Name, + llvm::AttributeList &AttrList, unsigned ID, + const llvm::Type *FuncType); + private: llvm::Constant *GetOrCreateLLVMFunction( StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable, @@ -1787,6 +1791,11 @@ class CodeGenModule : public CodeGenTypeCache { bool AttrOnCallSite, llvm::AttrBuilder &FuncAttrs); + void getDefaultFunctionFPAccuracyAttributes(StringRef Name, + llvm::AttrBuilder &FuncAttrs, + unsigned ID, + const llvm::Type *FuncType); + llvm::Metadata *CreateMetadataIdentifierImpl(QualType T, MetadataTypeMap &Map, StringRef Suffix); }; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 1acb5cfe6c016..1cab8ab2ee7c6 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2886,6 +2886,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, bool StrictFPModel = false; StringRef Float16ExcessPrecision = ""; StringRef BFloat16ExcessPrecision = ""; + StringRef FPAccuracy = ""; if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) { CmdArgs.push_back("-mlimit-float-precision"); @@ -2898,13 +2899,20 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, switch (optID) { default: break; + case options::OPT_ffp_accuracy_EQ: { + StringRef Val = A->getValue(); + FPAccuracy = Val; + break; + } case options::OPT_ffp_model_EQ: { // If -ffp-model= is seen, reset to fno-fast-math HonorINFs = true; HonorNaNs = true; ApproxFunc = false; - // Turning *off* -ffast-math restores the toolchain default. - MathErrno = TC.IsMathErrnoDefault(); + // Turning *off* -ffast-math restores the toolchain default, + // unless -fp-accuracy is used. 
+ if (FPAccuracy.empty()) + MathErrno = TC.IsMathErrnoDefault(); AssociativeMath = false; ReciprocalMath = false; SignedZeros = true; @@ -3173,8 +3181,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, HonorNaNs = true; // Turning on -ffast-math (with either flag) removes the need for // MathErrno. However, turning *off* -ffast-math merely restores the - // toolchain default (which may be false). - MathErrno = TC.IsMathErrnoDefault(); + // toolchain default (which may be false), unless -fp-accuracy is used. + if (FPAccuracy.empty()) + MathErrno = TC.IsMathErrnoDefault(); AssociativeMath = false; ReciprocalMath = false; ApproxFunc = false; @@ -6057,6 +6066,23 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, << A->getAsString(Args) << TripleStr; } + std::string FpAccuracyAttr; + auto RenderFPAccuracyOptions = [&FpAccuracyAttr](const Twine &OptStr) { + // In case the value is 'default' don't add the -ffp-builtin-accuracy + // attribute. + if (OptStr.str() != "default") { + if (FpAccuracyAttr.empty()) + FpAccuracyAttr = "-ffp-builtin-accuracy="; + else + FpAccuracyAttr += " "; + FpAccuracyAttr += OptStr.str(); + } + }; + for (StringRef A : Args.getAllArgValues(options::OPT_ffp_accuracy_EQ)) + RenderFPAccuracyOptions(A); + if (!FpAccuracyAttr.empty()) + CmdArgs.push_back(Args.MakeArgString(FpAccuracyAttr)); + // Decide whether to use verbose asm. Verbose assembly is the default on // toolchains which have the integrated assembler on by default. bool IsIntegratedAssemblerDefault = TC.IsIntegratedAssemblerDefault(); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 330169ca22efd..03874178ca357 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2034,6 +2034,9 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, } } + if (Args.getLastArg(options::OPT_ffp_builtin_accuracy_EQ)) + Opts.FPAccuracy = 1; + if (auto *arg = Args.getLastArg(options::OPT_fdiagnostics_misexpect_tolerance_EQ)) { auto ResultOrErr = parseToleranceOption(arg->getValue()); @@ -3328,6 +3331,13 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts, #include "clang/Driver/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING + if (!Opts.FPAccuracyVal.empty()) + GenerateArg(Args, OPT_ffp_builtin_accuracy_EQ, Opts.FPAccuracyVal, SA); + + for (const auto &F : Opts.FPAccuracyFuncMap) + GenerateArg(Args, OPT_ffp_builtin_accuracy_EQ, (F.second + ":" + F.first), + SA); + // The '-fcf-protection=' option is generated by CodeGenOpts generator. 
if (Opts.ObjC) { @@ -3570,6 +3580,69 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts, GenerateArg(Args, OPT_fno_gpu_rdc, SA); } +static void checkFPAccuracyIsValid(StringRef ValElement, + DiagnosticsEngine &Diags) { + if (!llvm::StringSwitch(ValElement) + .Case("default", true) + .Case("high", true) + .Case("low", true) + .Case("medium", true) + .Case("sycl", true) + .Case("cuda", true) + .Default(false)) + Diags.Report(diag::err_drv_unsupported_option_argument) + << "-ffp-accuracy" << ValElement; +} + +void CompilerInvocation::ParseFpAccuracyArgs(LangOptions &Opts, ArgList &Args, + DiagnosticsEngine &Diags) { + for (StringRef Values : Args.getAllArgValues(OPT_ffp_builtin_accuracy_EQ)) { + if (Opts.MathErrno) { + Diags.Report(diag::err_drv_incompatible_fp_accuracy_options); + } else { + SmallVector ValuesArr; + Values.split(ValuesArr, ' '); + for (const auto &Val : ValuesArr) { + SmallVector ValElement; + Val.split(ValElement, ':'); + // The option is of the form -ffp-accuracy=value. + if (ValElement.size() == 1) { + checkFPAccuracyIsValid(ValElement[0], Diags); + Opts.FPAccuracyVal = ValElement[0].str(); + } + // The option is of the form -ffp-accuracy=value:[f1, ... fn]. + if (ValElement.size() == 2) { + SmallVector FuncList; + ValElement[1].split(FuncList, ','); + for (StringRef FuncName : FuncList) { + if (FuncName.front() == '[') + FuncName = FuncName.drop_front(1); + if (FuncName.back() == ']') + FuncName = FuncName.drop_back(1); + auto FuncMap = Opts.FPAccuracyFuncMap.find(FuncName.str()); + if (FuncMap != Opts.FPAccuracyFuncMap.end()) { + if (!FuncMap->second.empty()) { + Diags.Report(diag::warn_function_fp_accuracy_already_set) + << FuncMap->second << FuncName.str(); + } + } else { + checkFPAccuracyIsValid(ValElement[0], Diags); + if (!Opts.FPAccuracyVal.empty()) + Diags.Report(diag::warn_function_fp_accuracy_already_set) + << Opts.FPAccuracyVal << FuncName.str(); + // No need to fill the map if the FPaccuracy is 'default'. + // The default builtin will be generated. 
+ if (!ValElement[0].equals("default")) + Opts.FPAccuracyFuncMap.insert( + {FuncName.str(), ValElement[0].str()}); + } + } + } + } + } + } +} + bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, const llvm::Triple &T, std::vector &Includes, @@ -3726,6 +3799,8 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, #include "clang/Driver/Options.inc" #undef LANG_OPTION_WITH_MARSHALLING + ParseFpAccuracyArgs(Opts, Args, Diags); + if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) { StringRef Name = A->getValue(); if (Name == "full" || Name == "branch") { diff --git a/clang/test/CodeGen/fp-accuracy.c b/clang/test/CodeGen/fp-accuracy.c new file mode 100644 index 0000000000000..7cc5296089adc --- /dev/null +++ b/clang/test/CodeGen/fp-accuracy.c @@ -0,0 +1,389 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ffp-builtin-accuracy=high \ +// RUN: -Wno-return-type -Wno-implicit-function-declaration -emit-llvm -o - %s \ +// RUN: | FileCheck --check-prefixes=CHECK %s + +// RUN: %clang_cc1 -triple x86_64-unknown-unknown \ +// RUN: "-ffp-builtin-accuracy=high:[acosf,cos,pow] low:[tan] medium:[sincos,log10]" \ +// RUN: -Wno-return-type -Wno-implicit-function-declaration -emit-llvm -o - %s \ +// RUN: | FileCheck --check-prefix=CHECK-F1 %s + +// RUN: %clang_cc1 -triple x86_64-unknown-unknown \ +// RUN: "-ffp-builtin-accuracy=medium high:[tan] cuda:[cos]" \ +// RUN: -Wno-return-type -Wno-implicit-function-declaration -emit-llvm -o - %s \ +// RUN: | FileCheck --check-prefix=CHECK-F2 %s + +// RUN: %clang_cc1 -triple x86_64-unknown-unknown \ +// RUN: "-ffp-builtin-accuracy=high low:[tan] medium:[sincos,log10]" \ +// RUN: -Wno-return-type -Wno-implicit-function-declaration -emit-llvm -o - %s \ +// RUN: | FileCheck --check-prefix=CHECK-F3 %s + +// RUN: %clang_cc1 -triple spir64-unknown-unknown -ffp-builtin-accuracy=sycl \ +// RUN: -D SPIR -Wno-implicit-function-declaration -emit-llvm -o - %s \ +// RUN: | FileCheck --check-prefix=CHECK-SPIR %s + +// RUN: %clang_cc1 -triple x86_64-unknown-unknown \ +// RUN: "-ffp-builtin-accuracy=default:[acosf,cos,pow]" \ +// RUN: -Wno-return-type -Wno-implicit-function-declaration -emit-llvm -o - %s \ +// RUN: | FileCheck --check-prefixes=CHECK-DEFAULT %s + +// RUN: %clang_cc1 -triple x86_64-unknown-unknown \ +// RUN: -Wno-return-type -Wno-implicit-function-declaration -emit-llvm -o - %s \ +// RUN: | FileCheck --check-prefixes=CHECK-DEFAULT %s + +#ifdef SPIR +// This is a declaration when compiling with -fsycl to avoid +// the compilation error "function with no prototype cannot use +// the spir_function calling convention". 
+void sincos(float, float *, float *); +double exp10(double); +double fadd(double, double); +float fdiv(float, float); +float fmul(float, float); +float frem(float, float); +float fsub(float, float); +double rsqrt(double); +#endif + + +// CHECK-LABEL: define dso_local void @f1 +// CHECK: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_HIGH:[0-9]+]] +// CHECK: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.asinh.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.atan.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.atan2.f64(double {{.*}}, double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.exp2.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.expm1.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.fadd.f64(double {{.*}}, double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.fdiv.f64(double {{.*}}, double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.fmul.f64(double {{.*}}, double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.frem.f64(double {{.*}}, double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.fsub.f64(double {{.*}}, double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.hypot.f64(double {{.*}}, double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.ldexp.f64(double {{.*}}, i32 {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.log1p.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.log2.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.pow.f64(double {{.*}}, double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.rsqrt.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.sinh.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_HIGH]] + +// CHECK-F1-LABEL: define dso_local void @f1 +// CHECK-F1: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.asinh.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.atan.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.atan2.f64(double {{.*}}, double 
{{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH:[0-9]+]] +// CHECK-F1: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) +// CHECK-F1: call double @llvm.exp.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) +// CHECK-F1: call double @llvm.exp2.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.expm1.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.fadd.f64(double {{.*}}, double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.fdiv.f64(double {{.*}}, double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.fmul.f64(double {{.*}}, double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.frem.f64(double {{.*}}, double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.fsub.f64(double {{.*}}, double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.hypot.f64(double {{.*}}, double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.ldexp.f64(double {{.*}}, i32 {{.*}}) +// CHECK-F1: call double @llvm.log.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F1_MEDIUM:[0-9]+]] +// CHECK-F1: call double @llvm.fpbuiltin.log1p.f64(double {{.*}}) +// CHECK-F1: call double @llvm.log2.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.pow.f64(double {{.*}}, double {{.*}}) #[[ATTR_F1_HIGH]] +// CHECK-F1: call double @llvm.fpbuiltin.rsqrt.f64(double {{.*}}) +// CHECK-F1: call double @llvm.sin.f64(double {{.*}}) +// CHECK-F1: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F1_MEDIUM]] +// CHECK-F1: call double @llvm.fpbuiltin.sinh.f64(double {{.*}}) +// CHECK-F1: call double @llvm.sqrt.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F1_LOW:[0-9]+]] +// CHECK-F1: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) +// +// CHECK-F2-LABEL: define dso_local void @f1 +// CHECK-F2: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_F2_MEDIUM:[0-9]+]] +// CHECK-F2: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.asinh.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.atan.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.atan2.f64(double {{.*}}, double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F2_CUDA:[0-9]+]] +// CHECK-F2: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.exp2.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.expm1.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.fadd.f64(double {{.*}}, double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double 
@llvm.fpbuiltin.fdiv.f64(double {{.*}}, double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.fmul.f64(double {{.*}}, double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.frem.f64(double {{.*}}, double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.fsub.f64(double {{.*}}, double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.hypot.f64(double {{.*}}, double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.ldexp.f64(double {{.*}}, i32 {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.log1p.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.log2.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.pow.f64(double {{.*}}, double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.rsqrt.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.sinh.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F2_HIGH:[0-9]+]] +// CHECK-F2: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// +// CHECK-F3-LABEL: define dso_local void @f1 +// CHECK-F3: call double @llvm.fpbuiltin.acos.f64(double %conv) #[[ATTR_F3_HIGH:[0-9]+]] +// CHECK-F3: call double @llvm.fpbuiltin.acosh.f64(double %conv2) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.asin.f64(double %conv4) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.asinh.f64(double %conv6) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.atan.f64(double %conv8) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.atan2.f64(double %conv10, double %conv11) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.atanh.f64(double %conv13) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double %conv15) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.cosh.f64(double %conv17) #[[ATTR_F3_HIGH]] +// CHECk-F3: call double @llvm.fpbuiltin.erf.f64(double %conv19) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.erfc.f64(double %conv21) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double %conv23) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.exp10.f64(double %conv25) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.exp2.f64(double %conv27) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.expm1.f64(double %conv29) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fadd.f64(double %conv31, double %conv32) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fdiv.f64(double %conv34, double %conv35) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fmul.f64(double %conv37, double %conv38) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.frem.f64(double %conv40, double %conv41) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.fsub.f64(double %conv43, double %conv44) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double 
@llvm.fpbuiltin.hypot.f64(double %conv46, double %conv47) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.ldexp.f64(double %conv49, i32 %conv50) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.log.f64(double %conv52) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.log10.f64(double %conv54) #[[ATTR_F3_MEDIUM:[0-9]+]] +// CHECK-F3: call double @llvm.fpbuiltin.log1p.f64(double %conv56) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.log2.f64(double %conv58) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.pow.f64(double %conv60, double %conv61) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.rsqrt.f64(double %conv63) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.sin.f64(double %conv65) #[[ATTR_F3_HIGH]] +// CHECK-F3: call void @llvm.fpbuiltin.sincos.f64(double %conv67, ptr %p1, ptr %p2) #[[ATTR_F3_MEDIUM]] +// CHECK-F3: call double @llvm.fpbuiltin.sinh.f64(double %conv68) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.sqrt.f64(double %conv70) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double %conv72) #[[ATTR_F3_LOW:[0-9]+]] +// CHECK-F3: call double @llvm.fpbuiltin.tanh.f64(double %conv74) #[[ATTR_F3_HIGH]] + +// CHECK-F3: attributes #[[ATTR_F3_HIGH]] = {{.*}}"fpbuiltin-max-error="="1.0f" +// CHECK-F3: attributes #[[ATTR_F3_MEDIUM]] = {{.*}}"fpbuiltin-max-error="="4.0f" +// CHECK-F3: attributes #[[ATTR_F3_LOW]] = {{.*}}"fpbuiltin-max-error="="67108864.0f" +// +// CHECK-SPIR-LABEL: define dso_local spir_func void @f1 +// CHECK-SPIR: call double @llvm.fpbuiltin.acos.f64(double {{.*}}) #[[ATTR_SYCL1:[0-9]+]] +// CHECK-SPIR: call double @llvm.fpbuiltin.acosh.f64(double {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call double @llvm.fpbuiltin.asin.f64(double {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call double @llvm.fpbuiltin.asinh.f64(double {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call double @llvm.fpbuiltin.atan.f64(double {{.*}}) #[[ATTR_SYCL2:[0-9]+]] +// CHECK-SPIR: call double @llvm.fpbuiltin.atan2.f64(double {{.*}}, double {{.*}}) #[[ATTR_SYCL3:[0-9]+]] +// CHECK-SPIR: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) #[[ATTR_SYCL2]] +// CHECK-SPIR: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_SYCL4:[0-9]+]] +// CHECK-SPIR: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) #[[ATTR_SYCL4]] +// CHECK-SPIR: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_SYCL5:[0-9]+]] +// CHECK-SPIR: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call double @llvm.fpbuiltin.exp2.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call double @llvm.fpbuiltin.expm1.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call double @llvm.fpbuiltin.fadd.f64(double {{.*}}, double {{.*}}) #[[ATTR_SYCL6:[0-9]+]] +// CHECK-SPIR: call float @llvm.fpbuiltin.fdiv.f32(float {{.*}}, float {{.*}}) #[[ATTR_SYCL7:[0-9]+]] +// CHECK-SPIR: call float @llvm.fpbuiltin.fmul.f32(float {{.*}}, float {{.*}}) #[[ATTR_SYCL6]] +// CHECK-SPIR: call float @llvm.fpbuiltin.frem.f32(float {{.*}}, float {{.*}}) #[[ATTR_SYCL6]] +// CHECK-SPIR: call float @llvm.fpbuiltin.fsub.f32(float {{.*}}, float {{.*}}) #[[ATTR_SYCL6]] +// CHECK-SPIR: call double @llvm.fpbuiltin.hypot.f64(double {{.*}}, double {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call double @llvm.fpbuiltin.ldexp.f64(double {{.*}}, i32 {{.*}}) 
#[[ATTR_SYCL6]] +// CHECK-SPIR: call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call double @llvm.fpbuiltin.log1p.f64(double {{.*}}) #[[ATTR_SYCL8:[0-9]+]] +// CHECK-SPIR: call double @llvm.fpbuiltin.log2.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call double @llvm.fpbuiltin.pow.f64(double {{.*}}, double {{.*}}) #[[ATTR_SYCL4]] +// CHECK-SPIR: call double @llvm.fpbuiltin.rsqrt.f64(double {{.*}}) #[[ATTR_SYCL8]] +// CHECK-SPIR: call double @llvm.fpbuiltin.sin.f64(double {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call void @llvm.fpbuiltin.sincos.f32(float {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call double @llvm.fpbuiltin.sinh.f64(double {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call double @llvm.fpbuiltin.sqrt.f64(double {{.*}}) #[[ATTR_SYCL6]] +// CHECK-SPIR: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_SYCL2]] +// CHECK-SPIR: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_SYCL2]] +// +void f1(float a, float b) { + float p1 = 0.f, p2 = 0.f; + + b = acos(b); + b = acosh(b); + b = asin(b); + b = asinh(b); + b = atan(b); + b = atan2(b,b); + b = atanh(b); + b = cos(b); + b = cosh(b); + b = erf(b); + b = erfc(b); + b = exp(b); + b = exp10(b); + b = exp2(b); + b = expm1(b); + b = fadd(b,b); + b = fdiv(b,b); + b = fmul(b,b); + b = frem(b,b); + b = fsub(b,b); + b = hypot(b,b); + b = ldexp(b,b); + b = log(b); + b = log10(b); + b = log1p(b); + b = log2(b); + b = pow(b,b); + b = rsqrt(b); + b = sin(b); + sincos(b,&p1,&p2); + b = sinh(b); + b = sqrt(b); + b =tan(b); + b = tanh(b); +} +// CHECK-LABEL: define dso_local void @f2 +// CHECK: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_HIGH]] +// CHECK: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_HIGH]] +// CHECK: call float @tanf(float noundef {{.*}}) +// +// CHECK-F1-LABEL: define dso_local void @f2 +// CHECK-F1: call float @llvm.cos.f32(float {{.*}}) +// CHECK-F1: call float @llvm.sin.f32(float {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F1_LOW]] +// CHECK-F1: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F1_MEDIUM]] +// CHECK-F1: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F1_MEDIUM]] +// CHECK-F1: call float @tanf(float noundef {{.*}}) +// +// CHECK-F2-LABEL: define dso_local void @f2 +// CHECK-F2: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F2_HIGH]] +// CHECK-F2: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call float @tanf(float noundef {{.*}}) +// +// CHECK-SPIR-LABEL: define dso_local spir_func void @f2 +// CHECK-SPIR: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_SYCL2]] +// CHECK-SPIR: call double 
@llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call void @llvm.fpbuiltin.sincos.f32(float {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_SYCL1]] +// CHECK-SPIR: call spir_func float @tanf(float noundef {{.*}}) + +// CHECK-LABEL: define dso_local void @f3 +// CHECK: call float @fake_exp10(float {{.*}}) +// CHECK-F1: call float @fake_exp10(float {{.*}}) +// CHECK-F2: call float @fake_exp10(float {{.*}}) +// CHECK-SPIR-LABEL: define dso_local spir_func void @f3 +// CHECK-SPIR: call spir_func float @fake_exp10(float {{.*}}) + +// CHECK: attributes #[[ATTR_HIGH]] = {{.*}}"fpbuiltin-max-error="="1.0f" + +// CHECK-F1: attributes #[[ATTR_F1_HIGH]] = {{.*}}"fpbuiltin-max-error="="1.0f" +// CHECK-F1: attributes #[[ATTR_F1_MEDIUM]] = {{.*}}"fpbuiltin-max-error="="4.0f" +// CHECK-F1: attributes #[[ATTR_F1_LOW]] = {{.*}}"fpbuiltin-max-error="="67108864.0f" + +// CHECK-F2: attributes #[[ATTR_F2_MEDIUM]] = {{.*}}"fpbuiltin-max-error="="4.0f" +// CHECK-F2: attributes #[[ATTR_F2_CUDA]] = {{.*}}"fpbuiltin-max-error="="2.0f" +// CHECK-F2: attributes #[[ATTR_F2_HIGH]] = {{.*}}"fpbuiltin-max-error="="1.0f" + +// CHECK-SPIR: attributes #[[ATTR_SYCL1]] = {{.*}}"fpbuiltin-max-error="="4.0f" +// CHECK-SPIR: attributes #[[ATTR_SYCL2]] = {{.*}}"fpbuiltin-max-error="="5.0f" +// CHECK-SPIR: attributes #[[ATTR_SYCL3]] = {{.*}}"fpbuiltin-max-error="="6.0f" +// CHECK-SPIR: attributes #[[ATTR_SYCL4]] = {{.*}}"fpbuiltin-max-error="="16.0f" +// CHECK-SPIR: attributes #[[ATTR_SYCL5]] = {{.*}}"fpbuiltin-max-error="="3.0f" +// CHECK-SPIR: attributes #[[ATTR_SYCL6]] = {{.*}}"fpbuiltin-max-error="="0.0f" +// CHECK-SPIR: attributes #[[ATTR_SYCL7]] = {{.*}}"fpbuiltin-max-error="="2.5f" +// CHECK-SPIR: attributes #[[ATTR_SYCL8]] = {{.*}}"fpbuiltin-max-error="="2.0f" + +// CHECK-DEFAULT-LABEL: define dso_local void @f1 +// CHECK-DEFAULT: call double @acos(double noundef {{.*}}) +// CHECK-DEFAULT: call double @acosh(double noundef {{.*}}) +// CHECK-DEFAULT: call double @asin(double noundef {{.*}}) +// CHECK-DEFAULT: call double @asinh(double noundef {{.*}}) +// CHECK-DEFAULT: call double @atan(double noundef {{.*}}) +// CHECK-DEFAULT: call double @atan2(double noundef {{.*}}, double noundef {{.*}}) +// CHECK-DEFAULT: call double @atanh(double noundef {{.*}}) +// CHECK-DEFAULT: call double @llvm.cos.f64(double {{.*}}) +// CHECK-DEFAULT: call double @cosh(double noundef {{.*}}) +// CHECK-DEFAULT: call double @erf(double noundef {{.*}}) +// CHECK-DEFAULT: call double @erfc(double noundef {{.*}}) +// CHECK-DEFAULT: call double @llvm.exp.f64(double {{.*}}) +// CHECK-DEFAULT: call i32 (double, ...) @exp10(double noundef {{.*}}) +// CHECK-DEFAULT: call double @llvm.exp2.f64(double {{.*}}) +// CHECK-DEFAULT: call double @expm1(double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fadd(double noundef {{.*}}, double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fdiv(double noundef {{.*}}, double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @fmul(double noundef {{.*}}, double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) @frem(double noundef {{.*}}, double noundef {{.*}}) +// CHECK-DEFAULT: call i32 (double, double, ...) 
@fsub(double noundef {{.*}}, double noundef {{.*}}) +// CHECK-DEFAULT: call double @hypot(double noundef {{.*}}, double noundef {{.*}}) +// CHECK-DEFAULT: call double @ldexp(double noundef {{.*}}, i32 noundef {{.*}}) +// CHECK-DEFAULT: call double @llvm.log.f64(double {{.*}}) +// CHECK-DEFAULT: call double @llvm.log10.f64(double {{.*}}) +// CHECK-DEFAULT: call double @log1p(double noundef {{.*}}) +// CHECK-DEFAULT: call double @llvm.log2.f64(double {{.*}}) +// CHECK-DEFAULT: call double @llvm.pow.f64(double {{.*}}, double {{.*}}) +// CHECK-DEFAULT: call i32 (double, ...) @rsqrt(double noundef {{.*}}) +// CHECK-DEFAULT: call double @llvm.sin.f64(double {{.*}}) +// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) +// CHECK-DEFAULT: call double @sinh(double noundef {{.*}}) +// CHECK-DEFAULT: call double @llvm.sqrt.f64(double {{.*}}) +// CHECK-DEFAULT: call double @tan(double noundef {{.*}}) +// CHECK-DEFAULT: call double @tanh(double noundef {{.*}}) +// +// CHECK-DEFAULT-LABEL: define dso_local void @f2 +// CHECK-DEFAULT: call float @llvm.cos.f32(float {{.*}}) +// CHECK-DEFAULT: call float @llvm.sin.f32(float {{.*}}) +// CHECK-DEFAULT: call double @tan(double noundef {{.*}}) +// CHECK-DEFAULT: call double @llvm.log10.f64(double {{.*}}) +// CHECK-DEFAULT: call i32 (double, ptr, ptr, ...) @sincos(double noundef {{.*}}, ptr noundef {{.*}}, ptr noundef {{.*}}) +// CHECK-DEFAULT: call float @tanf(float noundef {{.*}}) + +// CHECK-DEFAULT-LABEL: define dso_local void @f3 +// CHECK-DEFAULT: call float @fake_exp10(float {{.*}}) + +void f2(float a, float b) { + float sin = 0.f, cos = 0.f; + + b = cosf(b); + b = sinf(b); + b = tan(b); + b = log10(b); + sincos(b, &sin, &cos); + b = tanf(b); +} + +float fake_exp10(float a) __attribute__((no_builtin)){} +void f3(float a, float b) { + a = fake_exp10(b); +} diff --git a/clang/test/Driver/fp-accuracy.c b/clang/test/Driver/fp-accuracy.c new file mode 100644 index 0000000000000..e13c2dfc657f1 --- /dev/null +++ b/clang/test/Driver/fp-accuracy.c @@ -0,0 +1,64 @@ +// RUN: %clang -### -target x86_64 -ffp-accuracy=high -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=HIGH %s + +// RUN: %clang -### -target x86_64 -ffp-accuracy=low -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=LOW %s + +// RUN: %clang -### -target x86_64 -ffp-accuracy=medium -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=MEDIUM %s + +// RUN: %clang -### -target x86_64 -ffp-accuracy=sycl -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=SYCL %s + +// RUN: %clang -### -target x86_64 -ffp-accuracy=cuda -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CUDA %s + +// RUN: %clang -### -target x86_64 -ffp-accuracy=low:sin,cos -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=FUNC-1 %s + +// RUN: %clang -### -target x86_64 -ffp-accuracy=low:sin,cos -ffp-accuracy=high:tan -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=FUNC-2 %s + +// RUN: not %clang -Xclang -verify -fno-math-errno -ffp-accuracy=foo %s 2>&1 \ +// RUN: | FileCheck %s --check-prefixes=ERR + +// RUN: not %clang -Xclang -verify -fno-math-errno -ffp-accuracy=foo:[sin,cos] %s 2>&1 \ +// RUN: | FileCheck %s --check-prefixes=ERR + +// RUN: not %clang -Xclang -verify -fno-math-errno -ffp-accuracy=foo:[sin,cos] \ +// RUN: -ffp-accuracy=goo %s 2>&1 \ +// RUN: | FileCheck %s --check-prefixes=ERR + +// RUN: not %clang -Xclang -verify -fno-math-errno -ffp-accuracy=foo:[sin,cos] \ +// RUN: -ffp-accuracy=goo:[tan] %s 2>&1 \ +// RUN: | FileCheck %s --check-prefixes=ERR-1 + +// RUN: not 
%clang -Xclang -verify -fno-math-errno -ffp-accuracy=high=[sin] %s 2>& 1 \ +// RUN: | FileCheck %s --check-prefixes=ERR-2 + +// RUN: not %clang -Xclang -verify -fno-math-errno -ffp-accuracy=low:[sin,cos] \ +// RUN: -ffp-accuracy=high %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=WARN + +// RUN: not %clang -Xclang -verify -ffp-accuracy=low:[sin,cos] \ +// RUN: -ffp-accuracy=high -fmath-errno %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=ERR-3 + +// RUN: not %clang -Xclang -verify -ffp-accuracy=high \ +// RUN: -fmath-errno %s 2>&1 \ +// RUN: | FileCheck %s --check-prefixes=ERR-3 + + +// HIGH: "-ffp-builtin-accuracy=high" +// LOW: "-ffp-builtin-accuracy=low" +// MEDIUM: "-ffp-builtin-accuracy=medium" +// SYCL: "-ffp-builtin-accuracy=sycl" +// CUDA: "-ffp-builtin-accuracy=cuda" +// FUNC-1: "-ffp-builtin-accuracy=low:sin,cos" +// FUNC-2: "-ffp-builtin-accuracy=low:sin,cos high:tan" +// ERR: (frontend): unsupported argument 'foo' to option '-ffp-accuracy' +// ERR-1: (frontend): unsupported argument 'foo' to option '-ffp-accuracy' +// ERR-2: (frontend): unsupported argument 'high=[sin]' to option '-ffp-accuracy' +// WARN: (frontend): floating point accuracy value of 'high' has already been assigned to function 'cos' +// WARN: (frontend): floating point accuracy value of 'high' has already been assigned to function 'sin' +// ERR-3: (frontend): floating point accuracy requirements cannot be guaranteed when '-fmath-errno' is enabled; use '-fno-math-errno' to enable floating point accuracy control From c2f0858eb28579aca1836ccfa36d51713fdc9b43 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 14 Jun 2023 07:16:58 -0700 Subject: [PATCH 22/55] [CI] Remove FPGA Emulator workaround (#9855) I don't think we test it anywhere in our CI pipeline. --- devops/actions/e2e-tests/action.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/devops/actions/e2e-tests/action.yml b/devops/actions/e2e-tests/action.yml index 7da247dd297c1..d3efde094b3a0 100644 --- a/devops/actions/e2e-tests/action.yml +++ b/devops/actions/e2e-tests/action.yml @@ -55,12 +55,6 @@ runs: else echo "no TBB vars in /opt/runtimes or /runtimes"; fi - # TODO remove workaround of FPGA emu bug - mkdir -p icd - echo /usr/lib/x86_64-linux-gnu/intel-opencl/libigdrcl.so > icd/gpu.icd - echo /runtimes/oclcpu/x64/libintelocl.so > icd/cpu.icd - echo /opt/runtimes/oclcpu/x64/libintelocl.so > icd/cpu2.icd - export OCL_ICD_VENDORS=$PWD/icd echo "::group::sycl-ls --verbose" sycl-ls --verbose echo "::endgroup::" From 5f0dbfed527b8f393de4b8a87afa7b1fe7953c6b Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 14 Jun 2023 07:17:33 -0700 Subject: [PATCH 23/55] [CI] Remove ROCm LD_LIBRARY_PATH setup (#9856) Doesn't seem to be needed. 
--- devops/actions/e2e-tests/action.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/devops/actions/e2e-tests/action.yml b/devops/actions/e2e-tests/action.yml index d3efde094b3a0..4190f755edf27 100644 --- a/devops/actions/e2e-tests/action.yml +++ b/devops/actions/e2e-tests/action.yml @@ -45,8 +45,6 @@ runs: run: | export LD_LIBRARY_PATH=$PWD/toolchain/lib/:$LD_LIBRARY_PATH export PATH=$PWD/toolchain/bin/:$PATH - # TODO make this part of container build - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/hip/lib/:/opt/rocm/lib export LIT_OPTS="-v --no-progress-bar --show-unsupported --max-time 3600 --time-tests" if [ -e /runtimes/oneapi-tbb/env/vars.sh ]; then source /runtimes/oneapi-tbb/env/vars.sh; From c063b99843d61867881634e5e90ce16b184547d2 Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Wed, 14 Jun 2023 17:00:40 +0200 Subject: [PATCH 24/55] [SYCL] Re-land "Represent JointMatrixINTEL type as extension type" (#9841) This reverts commit 4447a50fb73068a438f0a332b138e73d824d5c2b. Previous attempt: https://github.com/intel/llvm/pull/8343 What changed: One extra patch is being added to the headers: https://github.com/intel/llvm/pull/9841/commits/ca0595b0f5069333ea58ff18ecbc4d34abcf3c21 with this patch clang won't generate llvm.memcpy for trivial c'tor. So later on inst combine won't replace it with a cast to i64 followed by load + store which SROA + mem2reg won't be able to handle for target extension types. It adds: ConvertSYCLJointMatrixINTELType - Convert SYCL joint_matrix type which is represented as a pointer to a structure to LLVM extension type with the parameters that follow SPIR-V JointMatrixINTEL type. The expected representation is: target("spirv.JointMatrixINTEL", %element_type, %rows%, %cols%, %scope%, %use%, (optional) %element_type_interpretation%) Better approach is to introduce joint matrix type to clang, but it's off the table now, since we are lacking OpenCL spec. Co-authored-by: Joshua Cranmer --------- Signed-off-by: Sidorov, Dmitry Co-authored-by: Alexey Bader --- clang/lib/CodeGen/CodeGenTypes.cpp | 142 ++++++++++-------- clang/lib/CodeGen/CodeGenTypes.h | 8 + clang/test/CodeGenSYCL/matrix.cpp | 34 ++--- .../sycl/ext/oneapi/matrix/matrix-jit.hpp | 18 +++ .../sycl/ext/oneapi/matrix/matrix-unified.hpp | 17 +++ .../matrix/matrix_load_store_as.cpp | 9 +- .../matrix/matrix_load_store_as_legacy.cpp | 9 +- sycl/test/matrix/legacy/matrix-int8-test.cpp | 6 +- sycl/test/matrix/matrix-int8-test.cpp | 6 +- 9 files changed, 161 insertions(+), 88 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 866661968293e..f16ecb66c3ccb 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -51,65 +51,6 @@ void CodeGenTypes::addRecordTypeName(const RecordDecl *RD, StringRef suffix) { SmallString<256> TypeName; llvm::raw_svector_ostream OS(TypeName); - // If RD is spirv_JointMatrixINTEL type, mangle differently. 
- if (CGM.getTriple().isSPIRV() || CGM.getTriple().isSPIR()) { - if (RD->getQualifiedNameAsString() == "__spv::__spirv_JointMatrixINTEL") { - if (auto TemplateDecl = dyn_cast(RD)) { - ArrayRef TemplateArgs = - TemplateDecl->getTemplateArgs().asArray(); - OS << "spirv.JointMatrixINTEL."; - for (auto &TemplateArg : TemplateArgs) { - OS << "_"; - if (TemplateArg.getKind() == TemplateArgument::Type) { - llvm::Type *TTy = ConvertType(TemplateArg.getAsType()); - if (TTy->isIntegerTy()) { - switch (TTy->getIntegerBitWidth()) { - case 8: - OS << "char"; - break; - case 16: - OS << "short"; - break; - case 32: - OS << "int"; - break; - case 64: - OS << "long"; - break; - default: - OS << "i" << TTy->getIntegerBitWidth(); - break; - } - } else if (TTy->isHalfTy()) { - OS << "half"; - } else if (TTy->isFloatTy()) { - OS << "float"; - } else if (TTy->isDoubleTy()) { - OS << "double"; - } else if (TTy->isBFloatTy()) { - OS << "bfloat16"; - } else if (TTy->isStructTy()) { - StringRef LlvmTyName = TTy->getStructName(); - // Emit half/bfloat16/tf32 for sycl[::*]::{half,bfloat16,tf32} - if (LlvmTyName.startswith("class.sycl::") || - LlvmTyName.startswith("class.__sycl_internal::")) - LlvmTyName = LlvmTyName.rsplit("::").second; - if (LlvmTyName != "half" && LlvmTyName != "bfloat16" && - LlvmTyName != "tf32") - llvm_unreachable("Wrong matrix base type!"); - OS << LlvmTyName; - } else { - llvm_unreachable("Wrong matrix base type!"); - } - } else if (TemplateArg.getKind() == TemplateArgument::Integral) { - OS << TemplateArg.getAsIntegral(); - } - } - Ty->setName(OS.str()); - return; - } - } - } OS << RD->getKindName() << '.'; // FIXME: We probably want to make more tweaks to the printing policy. For @@ -460,6 +401,77 @@ llvm::Type *CodeGenTypes::ConvertFunctionTypeInternal(QualType QFT) { return ResultType; } +template +llvm::Type *getJointMatrixINTELExtType(llvm::Type *CompTy, + ArrayRef TemplateArgs, + const unsigned Val = 0) { + // TODO: we should actually have exactly 5 template parameters: 1 for + // type and 4 for type parameters. But in previous version of the SPIR-V + // spec we have Layout matrix type parameter, that was later removed. + // Once we update to the newest version of the spec - this should be updated. + assert((TemplateArgs.size() == 5 || TemplateArgs.size() == 6) && + "Wrong JointMatrixINTEL template parameters number"); + // This is required to represent optional 'Component Type Interpretation' + // parameter + std::vector Params; + for (size_t I = 1; I != TemplateArgs.size(); ++I) { + assert(TemplateArgs[I].getKind() == TemplateArgument::Integral && + "Wrong JointMatrixINTEL template parameter"); + Params.push_back(TemplateArgs[I].getAsIntegral().getExtValue()); + } + // Don't add type interpretation for legacy matrices. + // Legacy matrices has 5 template parameters, while new representation + // has 6. + if (NeedTypeInterpret && TemplateArgs.size() != 5) + Params.push_back(Val); + + return llvm::TargetExtType::get(CompTy->getContext(), + "spirv.JointMatrixINTEL", {CompTy}, Params); +} + +/// ConvertSYCLJointMatrixINTELType - Convert SYCL joint_matrix type +/// which is represented as a pointer to a structure to LLVM extension type +/// with the parameters that follow SPIR-V JointMatrixINTEL type. 
+/// The expected representation is: +/// target("spirv.JointMatrixINTEL", %element_type, %rows%, %cols%, %scope%, +/// %use%, (optional) %element_type_interpretation%) +llvm::Type *CodeGenTypes::ConvertSYCLJointMatrixINTELType(RecordDecl *RD) { + auto *TemplateDecl = cast(RD); + ArrayRef TemplateArgs = + TemplateDecl->getTemplateArgs().asArray(); + assert(TemplateArgs[0].getKind() == TemplateArgument::Type && + "1st JointMatrixINTEL template parameter must be type"); + llvm::Type *CompTy = ConvertType(TemplateArgs[0].getAsType()); + + // Per JointMatrixINTEL spec the type can have an optional + // 'Component Type Interpretation' parameter. We should emit it in case + // if on SYCL level joint matrix accepts 'bfloat16' or 'tf32' objects as + // matrix's components. Yet 'bfloat16' should be represented as 'int16' and + // 'tf32' as 'float' types. + if (CompTy->isStructTy()) { + StringRef LlvmTyName = CompTy->getStructName(); + // Emit half/int16/float for sycl[::*]::{half,bfloat16,tf32} + if (LlvmTyName.startswith("class.sycl::") || + LlvmTyName.startswith("class.__sycl_internal::")) + LlvmTyName = LlvmTyName.rsplit("::").second; + if (LlvmTyName == "half") { + CompTy = llvm::Type::getHalfTy(getLLVMContext()); + return getJointMatrixINTELExtType(CompTy, TemplateArgs); + } else if (LlvmTyName == "tf32") { + CompTy = llvm::Type::getFloatTy(getLLVMContext()); + // 'tf32' interpretation is mapped to '0' + return getJointMatrixINTELExtType(CompTy, TemplateArgs, 0); + } else if (LlvmTyName == "bfloat16") { + CompTy = llvm::Type::getInt16Ty(getLLVMContext()); + // 'bfloat16' interpretation is mapped to '1' + return getJointMatrixINTELExtType(CompTy, TemplateArgs, 1); + } else { + llvm_unreachable("Wrong matrix base type!"); + } + } + return getJointMatrixINTELExtType(CompTy, TemplateArgs); +} + /// ConvertType - Convert the specified type to its LLVM form. llvm::Type *CodeGenTypes::ConvertType(QualType T) { T = Context.getCanonicalType(T); @@ -754,6 +766,18 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { llvm::Type *PointeeType = ConvertTypeForMem(ETy); if (PointeeType->isVoidTy()) PointeeType = llvm::Type::getInt8Ty(getLLVMContext()); + if (CGM.getTriple().isSPIRV() || CGM.getTriple().isSPIR()) { + const Type *ClangETy = ETy.getTypePtrOrNull(); + if (ClangETy && ClangETy->isStructureOrClassType()) { + RecordDecl *RD = ClangETy->getAsCXXRecordDecl(); + if (RD && RD->getQualifiedNameAsString() == + "__spv::__spirv_JointMatrixINTEL") { + ResultType = ConvertSYCLJointMatrixINTELType(RD); + break; + } + } + } + unsigned AS = getTargetAddressSpace(ETy); ResultType = llvm::PointerType::get(PointeeType, AS); break; diff --git a/clang/lib/CodeGen/CodeGenTypes.h b/clang/lib/CodeGen/CodeGenTypes.h index e76fda95513f6..3f198b2a3de1a 100644 --- a/clang/lib/CodeGen/CodeGenTypes.h +++ b/clang/lib/CodeGen/CodeGenTypes.h @@ -133,6 +133,14 @@ class CodeGenTypes { /// memory representation is usually i8 or i32, depending on the target. llvm::Type *ConvertTypeForMem(QualType T, bool ForBitField = false); + /// ConvertSYCLJointMatrixINTELType - Convert SYCL joint_matrix type + /// which is represented as a pointer to a structure to LLVM extension type + /// with the parameters that follow SPIR-V JointMatrixINTEL type. 
+ /// The expected representation is: + /// target("spirv.JointMatrixINTEL", %element_type, %rows%, %cols%, %scope%, + /// %use%, (optional) %element_type_interpretation%) + llvm::Type *ConvertSYCLJointMatrixINTELType(RecordDecl *RD); + /// GetFunctionType - Get the LLVM function type for \arg Info. llvm::FunctionType *GetFunctionType(const CGFunctionInfo &Info); diff --git a/clang/test/CodeGenSYCL/matrix.cpp b/clang/test/CodeGenSYCL/matrix.cpp index 69469811047fd..b2c0c51adba6e 100644 --- a/clang/test/CodeGenSYCL/matrix.cpp +++ b/clang/test/CodeGenSYCL/matrix.cpp @@ -5,18 +5,18 @@ #include namespace __spv { - template + template struct __spirv_JointMatrixINTEL; } -// CHECK: @_Z2f1{{.*}}(%spirv.JointMatrixINTEL._float_5_10_0_1 -void f1(__spv::__spirv_JointMatrixINTEL *matrix) {} +// CHECK: @_Z2f1{{.*}}(target("spirv.JointMatrixINTEL", float, 5, 10, 0, 1, 0) +void f1(__spv::__spirv_JointMatrixINTEL *matrix) {} -// CHECK: @_Z2f2{{.*}}(%spirv.JointMatrixINTEL._long_10_2_0_0 -void f2(__spv::__spirv_JointMatrixINTEL *matrix) {} +// CHECK: @_Z2f2{{.*}}(target("spirv.JointMatrixINTEL", i64, 10, 2, 0, 0, 0) +void f2(__spv::__spirv_JointMatrixINTEL *matrix) {} -// CHECK: @_Z2f3{{.*}}(%spirv.JointMatrixINTEL._char_10_2_0_0 -void f3(__spv::__spirv_JointMatrixINTEL *matrix) {} +// CHECK: @_Z2f3{{.*}}(target("spirv.JointMatrixINTEL", i8, 10, 2, 0, 0, 0) +void f3(__spv::__spirv_JointMatrixINTEL *matrix) {} namespace sycl { class half {}; @@ -25,17 +25,17 @@ namespace sycl { } typedef sycl::half my_half; -// CHECK: @_Z2f4{{.*}}(%spirv.JointMatrixINTEL._half_10_2_0_0 -void f4(__spv::__spirv_JointMatrixINTEL *matrix) {} +// CHECK: @_Z2f4{{.*}}(target("spirv.JointMatrixINTEL", half, 10, 2, 0, 0, 0) +void f4(__spv::__spirv_JointMatrixINTEL *matrix) {} -// CHECK: @_Z2f5{{.*}}(%spirv.JointMatrixINTEL._bfloat16_10_2_0_0 -void f5(__spv::__spirv_JointMatrixINTEL *matrix) {} +// CHECK: @_Z2f5{{.*}}(target("spirv.JointMatrixINTEL", i16, 10, 2, 0, 0, 0, 1) +void f5(__spv::__spirv_JointMatrixINTEL *matrix) {} -// CHECK: @_Z2f6{{.*}}(%spirv.JointMatrixINTEL._i128_10_2_0_0 -void f6(__spv::__spirv_JointMatrixINTEL<_BitInt(128), 10, 2, 0, 0> *matrix) {} +// CHECK: @_Z2f6{{.*}}(target("spirv.JointMatrixINTEL", i128, 10, 2, 0, 0, 0) +void f6(__spv::__spirv_JointMatrixINTEL<_BitInt(128), 10, 2, 0, 0, 0> *matrix) {} -// CHECK: @_Z2f7{{.*}}(%spirv.JointMatrixINTEL._tf32_10_2_0_0 -void f7(__spv::__spirv_JointMatrixINTEL *matrix) {} +// CHECK: @_Z2f7{{.*}}(target("spirv.JointMatrixINTEL", float, 10, 2, 0, 0, 0, 0) +void f7(__spv::__spirv_JointMatrixINTEL *matrix) {} -// CHECK: @_Z2f8{{.*}}(%spirv.JointMatrixINTEL._double_5_10_0_1 -void f8(__spv::__spirv_JointMatrixINTEL *matrix) {} +// CHECK: @_Z2f8{{.*}}(target("spirv.JointMatrixINTEL", double, 5, 10, 0, 1, 0) +void f8(__spv::__spirv_JointMatrixINTEL *matrix) {} diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp index 174e1a5f8d744..3fe1c2e2285dc 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp @@ -69,6 +69,24 @@ struct joint_matrix { get_wi_data() { return wi_data(*this); } + +#ifdef __SYCL_DEVICE_ONLY__ +#if defined(__SPIR__) + // Generate a non-trivial assignment operator and copy c'tor that prevents + // memcpy from being generated. 
+ // TODO: to remove, when either IGC can handle alloca JointMatrix or + // combination of InstCombine + SROA + mem2reg can remove it + joint_matrix(const joint_matrix &other) { + spvm = other.spvm; + return *this; + } + + joint_matrix &operator=(const joint_matrix &rhs) { + spvm = rhs.spvm; + return *this; + } +#endif // defined(__SPIR__) +#endif }; template @@ -39,16 +42,16 @@ int main(void) { it.barrier(access::fence_space::local_space); // A should load from local address space - // CHECK: %{{.*}} = tail call spir_func noundef %spirv.JointMatrixINTEL._short_8_16_0_3_0 addrspace(4)* @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(i16 addrspace(3)* noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}} + // CHECK: %{{.*}} = tail call spir_func noundef target("spirv.JointMatrixINTEL", i16, 8, 16, 0, 3, 0) @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(i16 addrspace(3)* noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}} joint_matrix_load( sg, tA, tileA.template get_multi_ptr(), 16); // B should load from global address space - // CHECK: %{{.*}} = tail call spir_func noundef %spirv.JointMatrixINTEL._short_16_16_2_3_1 addrspace(4)* @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(i16 addrspace(1)* noundef %{{.*}}, i64 noundef 32, i32 noundef 2, i32 noundef 3, i32 noundef 0) #{{.*}} + // CHECK: %{{.*}} = tail call spir_func noundef target("spirv.JointMatrixINTEL", i16, 16, 16, 2, 3, 1) @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(i16 addrspace(1)* noundef %{{.*}}, i64 noundef 32, i32 noundef 2, i32 noundef 3, i32 noundef 0) #{{.*}} joint_matrix_load(sg, tB, pB, 32); tC = joint_matrix_mad(sg, tA, tB, tC); // C should store to global address space - // CHECK: tail call spir_func void @_Z[[#]]__spirv_JointMatrixStoreINTEL{{.*}}(float addrspace(1)* noundef %{{.*}}, %spirv.JointMatrixINTEL._float_8_16_3_3_2 addrspace(4)* noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}} + // CHECK: tail call spir_func void @_Z[[#]]__spirv_JointMatrixStoreINTEL{{.*}}(float addrspace(1)* noundef %{{.*}}, target("spirv.JointMatrixINTEL", float, 8, 16, 3, 3, 2) noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}} joint_matrix_store(sg, tC, pC, 16, layout::row_major); }); }); diff --git a/sycl/test/check_device_code/matrix/matrix_load_store_as_legacy.cpp b/sycl/test/check_device_code/matrix/matrix_load_store_as_legacy.cpp index 57b7bdb183518..0ea08030a03ee 100644 --- a/sycl/test/check_device_code/matrix/matrix_load_store_as_legacy.cpp +++ b/sycl/test/check_device_code/matrix/matrix_load_store_as_legacy.cpp @@ -1,5 +1,8 @@ // RUN: %clangxx -fsycl-device-only -S -emit-llvm -o - %s | FileCheck %s +// Check that SROA and mem2reg won't leave alloca of matrix type in IR +// CHECK-NOT: alloca target("spirv.JointMatrixINTEL" + // check that correct address spaces are used to load from and store to #define SYCL_EXT_ONEAPI_MATRIX_VERSION 1 #include @@ -36,17 +39,17 @@ int main(void) { it.barrier(access::fence_space::local_space); // A should load from local address space - // CHECK: %{{.*}} = tail call spir_func noundef %spirv.JointMatrixINTEL._short_8_16_0_3 addrspace(4)* @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(i16 addrspace(3)* noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}} + // CHECK: %{{.*}} = tail call spir_func noundef target("spirv.JointMatrixINTEL", i16, 8, 16, 0, 3) @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(i16 addrspace(3)* noundef %{{.*}}, i64 noundef 16, i32 
noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}} joint_matrix_load( sg, tA, tileA.template get_multi_ptr(), 16, matrix_layout::row_major); // B should load from global address space - // CHECK: %{{.*}} = tail call spir_func noundef %spirv.JointMatrixINTEL._short_16_16_3_3 addrspace(4)* @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(i16 addrspace(1)* noundef %{{.*}}, i64 noundef 32, i32 noundef [[#]], i32 noundef 3, i32 noundef 0) #{{.*}} + // CHECK: %{{.*}} = tail call spir_func noundef target("spirv.JointMatrixINTEL", i16, 16, 16, 3, 3) @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(i16 addrspace(1)* noundef %{{.*}}, i64 noundef 32, i32 noundef [[#]], i32 noundef 3, i32 noundef 0) #{{.*}} joint_matrix_load(sg, tB, pB, 32, matrix_layout::packed_b); tC = joint_matrix_mad(sg, tA, tB, tC); // C should store to global address space - // CHECK: tail call spir_func void @_Z[[#]]__spirv_JointMatrixStoreINTEL{{.*}}(float addrspace(1)* noundef %{{.*}}, %spirv.JointMatrixINTEL._float_8_16_0_3 addrspace(4)* noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}} + // CHECK: tail call spir_func void @_Z[[#]]__spirv_JointMatrixStoreINTEL{{.*}}(float addrspace(1)* noundef %{{.*}}, target("spirv.JointMatrixINTEL", float, 8, 16, 0, 3) noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}} joint_matrix_store(sg, tC, pC, 16, matrix_layout::row_major); }); }); diff --git a/sycl/test/matrix/legacy/matrix-int8-test.cpp b/sycl/test/matrix/legacy/matrix-int8-test.cpp index 6efc0e89b0a57..bf1e71b95224b 100644 --- a/sycl/test/matrix/legacy/matrix-int8-test.cpp +++ b/sycl/test/matrix/legacy/matrix-int8-test.cpp @@ -1,8 +1,8 @@ // RUN: %clangxx -fsycl -fsycl-device-only -O2 -DSYCL_EXT_ONEAPI_MATRIX_VERSION=1 -S -emit-llvm -o - %s | FileCheck %s -// CHECK-DAG: %spirv.JointMatrixINTEL._char_12_48_0_3 = type opaque -// CHECK-DAG: %spirv.JointMatrixINTEL._int_12_12_0_3 = type opaque -// CHECK-DAG: %spirv.JointMatrixINTEL._char_48_12_3_3 = type opaque +// CHECK-DAG: target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3) +// CHECK-DAG: target("spirv.JointMatrixINTEL", i32, 12, 12, 0, 3) +// CHECK-DAG: target("spirv.JointMatrixINTEL", i8, 48, 12, 3, 3) #include #include diff --git a/sycl/test/matrix/matrix-int8-test.cpp b/sycl/test/matrix/matrix-int8-test.cpp index 63866c19f89fa..f8dcc26ab1b17 100644 --- a/sycl/test/matrix/matrix-int8-test.cpp +++ b/sycl/test/matrix/matrix-int8-test.cpp @@ -1,8 +1,8 @@ // RUN: %clangxx -fsycl -fsycl-device-only -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -O2 -S -emit-llvm -o - %s | FileCheck %s -// CHECK-DAG: %spirv.JointMatrixINTEL._char_12_48_0_3_0 = type opaque -// CHECK-DAG: %spirv.JointMatrixINTEL._int_12_12_3_3_2 = type opaque -// CHECK-DAG: %spirv.JointMatrixINTEL._char_48_12_2_3_1 = type opaque +// CHECK-DAG: target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3, 0) +// CHECK-DAG: target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2) +// CHECK-DAG: target("spirv.JointMatrixINTEL", i8, 48, 12, 2, 3, 1) #include #include From 90406b2ba07cc92982708c47a6c7f46e786329a1 Mon Sep 17 00:00:00 2001 From: Justin Cai Date: Wed, 14 Jun 2023 08:17:28 -0700 Subject: [PATCH 25/55] [SYCL] Const-qualify sycl::logical_{or, and} (#9861) These operators were changed from aliasing their `std` counterparts in https://github.com/intel/llvm/pull/9298 but a const-qualification was not added (as required by [4.17.2. Function objects](https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:function-objects)). 
--- sycl/include/sycl/functional.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/functional.hpp b/sycl/include/sycl/functional.hpp index 96ee0ac95b7cf..d93201c590af1 100644 --- a/sycl/include/sycl/functional.hpp +++ b/sycl/include/sycl/functional.hpp @@ -24,13 +24,13 @@ template using bit_xor = std::bit_xor; // std:logical_and/std::logical_or with a non-void type returns bool, // sycl requires returning T. template struct logical_and { - T operator()(const T &lhs, const T &rhs) { return lhs && rhs; } + T operator()(const T &lhs, const T &rhs) const { return lhs && rhs; } }; template <> struct logical_and : std::logical_and {}; template struct logical_or { - T operator()(const T &lhs, const T &rhs) { return lhs || rhs; } + T operator()(const T &lhs, const T &rhs) const { return lhs || rhs; } }; template <> struct logical_or : std::logical_or {}; From 83d0997587707c3026963f325a83f5f791ec391b Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 14 Jun 2023 11:23:17 -0400 Subject: [PATCH 26/55] [SYCL][DOC] Add sycl_ext_intel_grf_size extension (#9779) This extension is used to specify the register mode on an Intel GPU. Currently we only support specific register mode values on specific GPUs. --------- Signed-off-by: Sarnie, Nick --- .../proposed/sycl_ext_intel_grf_size.asciidoc | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 sycl/doc/extensions/proposed/sycl_ext_intel_grf_size.asciidoc diff --git a/sycl/doc/extensions/proposed/sycl_ext_intel_grf_size.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_intel_grf_size.asciidoc new file mode 100644 index 0000000000000..3323ea61718fb --- /dev/null +++ b/sycl/doc/extensions/proposed/sycl_ext_intel_grf_size.asciidoc @@ -0,0 +1,178 @@ += sycl_ext_intel_grf_size + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2023-2023 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 7 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + +This extension also depends on the following other SYCL extensions: + +* link:../experimental/sycl_ext_oneapi_properties.asciidoc[ + sycl_ext_oneapi_properties] +* link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[ + sycl_ext_oneapi_kernel_properties] + +== Status + +This is a proposed extension specification, intended to gather community +feedback. Interfaces defined in this specification may not be implemented yet +or may be in a preliminary state. The specification itself may also change in +incompatible ways before it is finalized. 
*Shipping software products should +not rely on APIs defined in this specification.* + + +== Backend support status + +This extension is currently implemented in {dpcpp} only for Intel GPU devices and +only when using the Level Zero backend or OpenCL backend. +Attempting to use this extension in kernels that run on other devices or +backends may result in undefined behavior. Be aware that the compiler +is not able to issue a diagnostic to warn you if this happens. + +== Overview + +There are devices where the size of the general register file (GRF) used by a kernel is +configurable. Developers may want to change the GRF size based on their +application. This extension adds the kernel property `grf_size` which provides a way +to specify the GRF size for a SYCL kernel, and the kernel property `grf_size_automatic` +which provides a way to request a valid GRF size be automatically chosen. + +The properties described in this extension are advanced features that most applications +should not need to use. In most cases, applications get the best performance +without using these properties. + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_INTEL_GRF_SIZE` to one of the values defined in the table +below. Applications can test for the existence of this macro to determine if +the implementation supports this feature, or applications can test the macro's +value to determine which of the extension's features the implementation +supports. + + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + +=== Properties + +|=== +|Property|Description + +|`grf_size` +|The `grf_size` kernel property provides a way to specify the GRF size used by a kernel. +It accepts a single unsigned integer value. + +|`grf_size_automatic` +| The `grf_size_automatic` kernel property adds the requirement that the kernel use any of the supported GRF sizes. The manner in which the GRF size is selected is implementation-defined. + +|=== + +At most one of the `grf_size` and `grf_size_automatic` properties may be associated with a kernel. + +If a kernel is not associated with a `grf_size` or `grf_size_automatic` property, the manner in which the GRF size is selected is implementation-defined. + +The properties are defined as follows: +```c++ +namespace sycl::ext::intel::experimental { + +struct grf_size_key { + template + using value_t = + oneapi::experimental::property_value>; +}; + +struct grf_size_automatic_key { + using value_t = + oneapi::experimental::property_value; +}; + +template +inline constexpr grf_size_key::value_t grf_size; + +inline constexpr grf_size_automatic_key::value_t grf_size_automatic; + +} // namespace sycl::ext::intel::experimental +``` +The supported values are as follows: +[%header,cols="1,5"] +|=== +|GPU |Supported Values +| PVC | 128 (small register file), 256 (large register file) +| DG2 | 128 (small register file), 256 (large register file) +|=== + +Providing a value not consistent with the supported values may result in undefined behavior. + +=== Using the properties in a kernel + +A simple example of using this extension is below. + +```c++ +namespace syclex = sycl::ext::oneapi::experimental; +namespace intelex = sycl::ext::intel::experimental; +{ + ... 
+ syclex::properties kernel_properties{intelex::grf_size<256>}; + + q.single_task(kernel_properties, [=] { + ... + }).wait(); +} +{ + ... + syclex::properties kernel_properties{intelex:grf_size_automatic}; + + q.single_task(kernel_properties, [=] { + ... + }).wait(); +} +``` + + From 1d0230a0a198638d40677afb220b70c21e77bb38 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 14 Jun 2023 09:04:23 -0700 Subject: [PATCH 27/55] [CI] Try to fix lint issues (#9876) https://github.com/intel/llvm/pull/9844 somehow caused problems with lint tasks when `origin/sycl` is newer than PR's merge base with it. I don't understand how that wasn't a problem before, but let's try to fix it. While on it, start using sparse checkout to get `devops/actions/cached_checkout` instead of "wget". --- .github/workflows/sycl_precommit.yml | 31 ++++++++++------------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/.github/workflows/sycl_precommit.yml b/.github/workflows/sycl_precommit.yml index 8dc8cc1b18969..c3e54c49dfed9 100644 --- a/.github/workflows/sycl_precommit.yml +++ b/.github/workflows/sycl_precommit.yml @@ -37,30 +37,21 @@ jobs: # actions/checkout fails without "--privileged". options: -u 1001:1001 --privileged steps: - - name: Fake actions/checkout task - uses: actions/checkout@v3 + - uses: actions/checkout@v3 with: - # cached_checkout below uses actions/checkout internally. However, when - # actions/checkout is run from within another action step (not from - # workflow), github seems to try to download from within the container - # and doesn't have requried filesystem permissions. Make sure it's - # already downloaded by the time it's needed by checking out some small - # repository. - repository: actions/checkout - path: fake-checkout - - name: 'PR commits + 1' - run: echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 1 ))" >> "${GITHUB_ENV}" - - name: Setup action - # We can switch to `cp -r /actions .` once changes in cached_checkout are - # propagated into the nightly container image. - run: | - mkdir -p actions/cached_checkout - wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/cached_checkout/action.yml -P ./actions/cached_checkout - - uses: ./actions/cached_checkout + ref: ${{ github.event.pull_request.merge_commit_sha }} + sparse-checkout: | + devops/actions/cached_checkout + - name: 'PR commits + 2' + run: echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 2 ))" >> "${GITHUB_ENV}" + - uses: ./devops/actions/cached_checkout with: path: src fetch-depth: ${{ env.PR_FETCH_DEPTH }} - ref: ${{ github.event.pull_request.head.sha }} + # clang-format uses github.event.pull_request.base.sha that has the top + # of the base branch, not the merge base. As such, checkout the merge + # commit instead of github.event.pull_request.head.sha. + ref: ${{ github.event.pull_request.merge_commit_sha }} cache_path: "/__w/repo_cache/" merge: false - name: Run clang-format From a69e5152874e5c3805d6ec9b46c71331da3a277d Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Wed, 14 Jun 2023 17:47:17 +0100 Subject: [PATCH 28/55] [SYCL] Fix multi_ptr ctor for extended address spaces (#9869) The sycl_ext_usm_address_spaces extension adds the ext_intel_global_device_space address space together with additional multi_ptr constructors for creating a multi_ptr from an accessor. 
However, the current implementation fails to construct the multi_ptr from an accessor when the extended address space decorations are enabled (through __ENABLE_USM_ADDR_SPACE__) as it attempts to use the normal global address space decoration. This commit fixes these constructors by doing a legal cast of the underlying global-space pointer to a ext_intel_global_device_space decorated pointer. Signed-off-by: Larsen, Steffen --- sycl/include/sycl/multi_ptr.hpp | 17 +++++++++---- sycl/test/multi_ptr/ext_addr_spaces.cpp | 33 +++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 5 deletions(-) create mode 100644 sycl/test/multi_ptr/ext_addr_spaces.cpp diff --git a/sycl/include/sycl/multi_ptr.hpp b/sycl/include/sycl/multi_ptr.hpp index f8925954be303..765a8040c9918 100644 --- a/sycl/include/sycl/multi_ptr.hpp +++ b/sycl/include/sycl/multi_ptr.hpp @@ -136,7 +136,9 @@ class multi_ptr { multi_ptr(accessor Accessor) - : multi_ptr(Accessor.template get_multi_ptr()) {} + : multi_ptr(detail::cast_AS( + Accessor.template get_multi_ptr() + .get_decorated())) {} // Only if Space == local_space || generic_space template , Dimensions, Mode, access::target::device, isPlaceholder, PropertyListT> Accessor) - : m_Pointer(Accessor.template get_multi_ptr() - .get_decorated()) {} + : m_Pointer(detail::cast_AS( + Accessor.template get_multi_ptr() + .get_decorated())) {} // Only if Space == local_space || generic_space and element type is const template { multi_ptr(accessor Accessor) - : multi_ptr(Accessor.template get_multi_ptr()) {} + : multi_ptr(detail::cast_AS( + Accessor.template get_multi_ptr() + .get_decorated())) {} // Only if Space == local_space template < @@ -575,7 +580,9 @@ class multi_ptr { multi_ptr(accessor Accessor) - : multi_ptr(Accessor.template get_multi_ptr()) {} + : multi_ptr(detail::cast_AS( + Accessor.template get_multi_ptr() + .get_decorated())) {} // Only if Space == local_space template < diff --git a/sycl/test/multi_ptr/ext_addr_spaces.cpp b/sycl/test/multi_ptr/ext_addr_spaces.cpp new file mode 100644 index 0000000000000..1b574f61701d7 --- /dev/null +++ b/sycl/test/multi_ptr/ext_addr_spaces.cpp @@ -0,0 +1,33 @@ +// RUN: %clangxx -fsycl -D__ENABLE_USM_ADDR_SPACE__ -fsyntax-only -Xclang -verify %s -Xclang -verify-ignore-unexpected=note,warning +// expected-no-diagnostics + +// Checks that extended address spaces are allowed when creating multi_ptr from +// accessors. + +#include + +using namespace sycl; + +int main() { + queue Q; + + buffer Buf{1}; + Q.submit([&](handler &CGH) { + accessor Acc{Buf, CGH, read_write}; + CGH.single_task([=]() { + device_ptr MPtr1{Acc}; + device_ptr MPtr2{Acc}; + device_ptr MPtr3{Acc}; + device_ptr MPtr4{Acc}; + device_ptr MPtr5{Acc}; + device_ptr MPtr6{Acc}; + device_ptr MPtr7{Acc}; + device_ptr MPtr8{Acc}; + device_ptr MPtr9{Acc}; + device_ptr MPtr10{Acc}; + device_ptr MPtr11{Acc}; + device_ptr MPtr12{Acc}; + }); + }).wait_and_throw(); + return 0; +} From 52c4efb7e5de01497091e83b1adb30cd46b74300 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 14 Jun 2023 10:06:53 -0700 Subject: [PATCH 29/55] [CI] Add debug output for the clang-format task (#9881) Manual testing before merging lint-related PRs didn't reveal issues but it seems to misbehave after merge. Add some debug output to root cause. I hope to address the issues during the day, if not I'm going to revert the changes in the evening. 
--- devops/actions/clang-format/action.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/devops/actions/clang-format/action.yml b/devops/actions/clang-format/action.yml index cf448ee4c5c11..4cc6c0cedc871 100644 --- a/devops/actions/clang-format/action.yml +++ b/devops/actions/clang-format/action.yml @@ -14,6 +14,11 @@ runs: git config --global --add safe.directory ${{ inputs.path }} git -C ${{ inputs.path }} clang-format ${{ github.event.pull_request.base.sha }} git -C ${{ inputs.path }} diff > ./clang-format.patch + - name: Debug + shell: bash + run: | + git -C ${{ inputs.path }} log ${{ github.event.pull_request.base.sha }}..HEAD + git -C ${{ inputs.path }} diff ${{ github.event.pull_request.base.sha }}..HEAD # Add patch with formatting fixes to CI job artifacts - uses: actions/upload-artifact@v1 with: From 3d866f2b1791e089e7c4f5d632edfb7226aad7ce Mon Sep 17 00:00:00 2001 From: Evgeniy Date: Wed, 14 Jun 2023 11:17:12 -0700 Subject: [PATCH 30/55] [SYCL][ESIMD][NFC] Do not use deprecated APIs in ESIMD headers and tests (#9859) Co-authored-by: Vyacheslav Klochkov --- sycl/include/sycl/ext/intel/esimd/common.hpp | 16 ++++++++-------- .../ext/intel/esimd/detail/memory_intrin.hpp | 8 ++++---- sycl/include/sycl/ext/intel/esimd/memory.hpp | 16 ++++++++-------- .../experimental/esimd/detail/math_intrin.hpp | 16 ++++++++-------- .../ESIMD/lsc/lsc_load_store_2d_compare.cpp | 4 ++-- sycl/test/esimd/lsc.cpp | 14 +++++++------- 6 files changed, 37 insertions(+), 37 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/common.hpp b/sycl/include/sycl/ext/intel/esimd/common.hpp index ce48d9c1bc86c..46a1cdb06e3b9 100644 --- a/sycl/include/sycl/ext/intel/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/esimd/common.hpp @@ -244,9 +244,9 @@ constexpr __ESIMD_NS::native::lsc::atomic_op to_lsc_atomic_op() { return __ESIMD_NS::native::lsc::atomic_op::inc; case __ESIMD_NS::atomic_op::dec: return __ESIMD_NS::native::lsc::atomic_op::dec; - case __ESIMD_NS::atomic_op::min: + case __ESIMD_NS::atomic_op::umin: return __ESIMD_NS::native::lsc::atomic_op::umin; - case __ESIMD_NS::atomic_op::max: + case __ESIMD_NS::atomic_op::umax: return __ESIMD_NS::native::lsc::atomic_op::umax; case __ESIMD_NS::atomic_op::cmpxchg: return __ESIMD_NS::native::lsc::atomic_op::cmpxchg; @@ -256,9 +256,9 @@ constexpr __ESIMD_NS::native::lsc::atomic_op to_lsc_atomic_op() { return __ESIMD_NS::native::lsc::atomic_op::bit_or; case __ESIMD_NS::atomic_op::bit_xor: return __ESIMD_NS::native::lsc::atomic_op::bit_xor; - case __ESIMD_NS::atomic_op::minsint: + case __ESIMD_NS::atomic_op::smin: return __ESIMD_NS::native::lsc::atomic_op::smin; - case __ESIMD_NS::atomic_op::maxsint: + case __ESIMD_NS::atomic_op::smax: return __ESIMD_NS::native::lsc::atomic_op::smax; case __ESIMD_NS::atomic_op::fmax: return __ESIMD_NS::native::lsc::atomic_op::fmax; @@ -291,9 +291,9 @@ constexpr __ESIMD_NS::atomic_op to_atomic_op() { case __ESIMD_NS::native::lsc::atomic_op::dec: return __ESIMD_NS::atomic_op::dec; case __ESIMD_NS::native::lsc::atomic_op::umin: - return __ESIMD_NS::atomic_op::min; + return __ESIMD_NS::atomic_op::umin; case __ESIMD_NS::native::lsc::atomic_op::umax: - return __ESIMD_NS::atomic_op::max; + return __ESIMD_NS::atomic_op::umax; case __ESIMD_NS::native::lsc::atomic_op::cmpxchg: return __ESIMD_NS::atomic_op::cmpxchg; case __ESIMD_NS::native::lsc::atomic_op::bit_and: @@ -303,9 +303,9 @@ constexpr __ESIMD_NS::atomic_op to_atomic_op() { case __ESIMD_NS::native::lsc::atomic_op::bit_xor: return __ESIMD_NS::atomic_op::bit_xor; case 
__ESIMD_NS::native::lsc::atomic_op::smin: - return __ESIMD_NS::atomic_op::minsint; + return __ESIMD_NS::atomic_op::smin; case __ESIMD_NS::native::lsc::atomic_op::smax: - return __ESIMD_NS::atomic_op::maxsint; + return __ESIMD_NS::atomic_op::smax; case __ESIMD_NS::native::lsc::atomic_op::fmax: return __ESIMD_NS::atomic_op::fmax; case __ESIMD_NS::native::lsc::atomic_op::fmin: diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index 3c9779843c536..6c9448e79b072 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -540,13 +540,13 @@ __esimd_svm_atomic1(__ESIMD_DNS::vector_type_t addrs, (Op == __ESIMD_NS::atomic_op::fsub)) { Oldval[AddrIdx] = __ESIMD_DNS::atomic_sub((Ty *)addrs[AddrIdx], src0[AddrIdx]); - } else if constexpr ((Op == __ESIMD_NS::atomic_op::minsint) || - (Op == __ESIMD_NS::atomic_op::min) || + } else if constexpr ((Op == __ESIMD_NS::atomic_op::smin) || + (Op == __ESIMD_NS::atomic_op::umin) || (Op == __ESIMD_NS::atomic_op::fmin)) { Oldval[AddrIdx] = __ESIMD_DNS::atomic_min((Ty *)addrs[AddrIdx], src0[AddrIdx]); - } else if constexpr ((Op == __ESIMD_NS::atomic_op::maxsint) || - (Op == __ESIMD_NS::atomic_op::max) || + } else if constexpr ((Op == __ESIMD_NS::atomic_op::smax) || + (Op == __ESIMD_NS::atomic_op::umax) || (Op == __ESIMD_NS::atomic_op::fmax)) { Oldval[AddrIdx] = __ESIMD_DNS::atomic_max((Ty *)addrs[AddrIdx], src0[AddrIdx]); diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 44071c60908de..4cca9c2cb8ec2 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -1015,18 +1015,18 @@ constexpr void check_atomic() { } if constexpr (Op == __ESIMD_NS::atomic_op::add || Op == __ESIMD_NS::atomic_op::sub || - Op == __ESIMD_NS::atomic_op::min || - Op == __ESIMD_NS::atomic_op::max || + Op == __ESIMD_NS::atomic_op::umin || + Op == __ESIMD_NS::atomic_op::umax || Op == __ESIMD_NS::atomic_op::bit_and || Op == __ESIMD_NS::atomic_op::bit_or || Op == __ESIMD_NS::atomic_op::bit_xor || - Op == __ESIMD_NS::atomic_op::minsint || - Op == __ESIMD_NS::atomic_op::maxsint) { + Op == __ESIMD_NS::atomic_op::smin || + Op == __ESIMD_NS::atomic_op::smax) { static_assert(IsInt2BytePlus, "Integral 16-bit or wider type is expected"); - constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::minsint) || - (Op == __ESIMD_NS::atomic_op::maxsint); - constexpr bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::min) || - (Op == __ESIMD_NS::atomic_op::max); + constexpr bool IsSignedMinmax = (Op == __ESIMD_NS::atomic_op::smin) || + (Op == __ESIMD_NS::atomic_op::smax); + constexpr bool IsUnsignedMinmax = (Op == __ESIMD_NS::atomic_op::umin) || + (Op == __ESIMD_NS::atomic_op::umax); if constexpr (IsSignedMinmax || IsUnsignedMinmax) { constexpr bool SignOK = std::is_signed_v == IsSignedMinmax; diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp index 8fc022307fba4..ec5d33918869c 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/math_intrin.hpp @@ -486,17 +486,17 @@ __esimd_dpas_inner(const __ESIMD_DNS::vector_type_t *src0, constexpr bool pvcHfDest = isPvc && std::is_same_v && - src1_precision == __ESIMD_ENS::argument_type::FP16 && - 
src2_precision == __ESIMD_ENS::argument_type::FP16, + src1_precision == __ESIMD_XMX_NS::dpas_argument_type::fp16 && + src2_precision == __ESIMD_XMX_NS::dpas_argument_type::fp16, pvcHfSrc0 = isPvc && std::is_same_v && - src1_precision == __ESIMD_ENS::argument_type::FP16 && - src2_precision == __ESIMD_ENS::argument_type::FP16, + src1_precision == __ESIMD_XMX_NS::dpas_argument_type::fp16 && + src2_precision == __ESIMD_XMX_NS::dpas_argument_type::fp16, pvcBfDest = isPvc && std::is_same_v && - src1_precision == __ESIMD_ENS::argument_type::BF16 && - src2_precision == __ESIMD_ENS::argument_type::BF16, + src1_precision == __ESIMD_XMX_NS::dpas_argument_type::bf16 && + src2_precision == __ESIMD_XMX_NS::dpas_argument_type::bf16, pvcBfSrc0 = isPvc && std::is_same_v && - src1_precision == __ESIMD_ENS::argument_type::BF16 && - src2_precision == __ESIMD_ENS::argument_type::BF16, + src1_precision == __ESIMD_XMX_NS::dpas_argument_type::bf16 && + src2_precision == __ESIMD_XMX_NS::dpas_argument_type::bf16, pvcBfOrHfDest = pvcBfDest || pvcHfDest, pvcBfDestChecks = diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp index b4867371fc1fc..acee332f6cf24 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_load_store_2d_compare.cpp @@ -50,9 +50,9 @@ int main() { constexpr uint32_t width = SurfaceWidth * sizeof(float) - 1; constexpr uint32_t height = SurfaceHeight - 1; constexpr uint32_t pitch = SurfacePitch * sizeof(float) - 1; - auto data_a = lsc_load2d( + auto data_a = lsc_load_2d( A, width, height, pitch, x, y); - auto data_b = lsc_load2d( + auto data_b = lsc_load_2d( B, width, height, pitch, x, y); auto data_c = data_a + data_b; diff --git a/sycl/test/esimd/lsc.cpp b/sycl/test/esimd/lsc.cpp index 7f622162bbff3..25ff4756b340d 100644 --- a/sycl/test/esimd/lsc.cpp +++ b/sycl/test/esimd/lsc.cpp @@ -134,20 +134,20 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void foo(AccType &acc) { unsigned data_height, data_width, data_pitch, x, y; // CHECK: {{[^)]+}} = call <32 x i32> @llvm.genx.lsc.load2d.stateless.v32i32.v32i1.i64(<32 x i1> {{[^)]+}}, i8 1, i8 1, i8 3, i8 1, i8 2, i16 4, i16 4, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}) - simd data7 = - lsc_load2d( + simd data7 = + lsc_load_2d( ptr, data_width, data_height, data_pitch, x, y); simd data8 = 7; // CHECK: call void @llvm.genx.lsc.store2d.stateless.v16i1.i64.v16i32(<16 x i1> {{[^)]+}}, i8 1, i8 1, i8 3, i8 1, i8 1, i16 4, i16 4, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, <16 x i32> {{[^)]+}}) - lsc_store2d( + lsc_store_2d( ptr, data_width, data_height, data_pitch, x, y, data8); // CHECK: call void @llvm.genx.lsc.prefetch2d.stateless.v32i1.i64(<32 x i1> {{[^)]+}}, i8 1, i8 2, i8 3, i8 1, i8 2, i16 4, i16 4, i8 0, i64 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}, i32 {{[^)]+}}) - lsc_prefetch2d(ptr, data_width, data_height, data_pitch, - x, y); + lsc_prefetch_2d(ptr, data_width, data_height, data_pitch, + x, y); lsc_fence(); From c6a9eeec742653f5666ccada73f3e260dfb44d3a Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 14 Jun 2023 11:22:59 -0700 Subject: [PATCH 31/55] [CI] Another attempt to fix lint task (#9885) --- .github/workflows/sycl_precommit.yml | 6 +----- devops/actions/clang-format/action.yml | 15 +++++++++------ 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/.github/workflows/sycl_precommit.yml 
b/.github/workflows/sycl_precommit.yml index c3e54c49dfed9..d2ebb46794328 100644 --- a/.github/workflows/sycl_precommit.yml +++ b/.github/workflows/sycl_precommit.yml @@ -39,7 +39,6 @@ jobs: steps: - uses: actions/checkout@v3 with: - ref: ${{ github.event.pull_request.merge_commit_sha }} sparse-checkout: | devops/actions/cached_checkout - name: 'PR commits + 2' @@ -48,10 +47,7 @@ jobs: with: path: src fetch-depth: ${{ env.PR_FETCH_DEPTH }} - # clang-format uses github.event.pull_request.base.sha that has the top - # of the base branch, not the merge base. As such, checkout the merge - # commit instead of github.event.pull_request.head.sha. - ref: ${{ github.event.pull_request.merge_commit_sha }} + ref: ${{ github.event.pull_request.head.sha }} cache_path: "/__w/repo_cache/" merge: false - name: Run clang-format diff --git a/devops/actions/clang-format/action.yml b/devops/actions/clang-format/action.yml index 4cc6c0cedc871..e3710949603d2 100644 --- a/devops/actions/clang-format/action.yml +++ b/devops/actions/clang-format/action.yml @@ -12,13 +12,16 @@ runs: shell: bash {0} run: | git config --global --add safe.directory ${{ inputs.path }} - git -C ${{ inputs.path }} clang-format ${{ github.event.pull_request.base.sha }} + # TODO: Should we just drop fetch-depth in the cached checkout? + base=$(git -C ${{ inputs.path }} merge-base ${{ github.event.pull_request.base.sha }} HEAD) + echo "::group::Debug" + echo "HEAD:" + git -C ${{ inputs.path }} log -1 HEAD + echo "Merge-base:" + git -C ${{ inputs.path }} log -1 $base + echo "::endgroup::" + git -C ${{ inputs.path }} clang-format $base git -C ${{ inputs.path }} diff > ./clang-format.patch - - name: Debug - shell: bash - run: | - git -C ${{ inputs.path }} log ${{ github.event.pull_request.base.sha }}..HEAD - git -C ${{ inputs.path }} diff ${{ github.event.pull_request.base.sha }}..HEAD # Add patch with formatting fixes to CI job artifacts - uses: actions/upload-artifact@v1 with: From ef033238cfb594ea1b75f39090117c323daaf0db Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Wed, 14 Jun 2023 21:21:22 +0200 Subject: [PATCH 32/55] [SYCL] Improve is_compatible (#9769) Modify `is_compatible` to check if specific target is defined with `-fsycl-targets` and change the result. Previously there was a situation when kernel is compatible with the device by aspects, but actually it fails to run on this device as it was compiled for another target device. 
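As an illustration of the intended usage, applications can keep guarding kernel submission on `sycl::is_compatible` and now get a meaningful answer when the binary was built for a different target. The sketch below is a condensed form of the `Inputs/is_compatible_with_env.cpp` test added by this patch (it uses the queue shortcut instead of an explicit `submit`):

```c++
// Condensed sketch of the new test input: with this change
// is_compatible(dev) returns false when the only device images available
// were compiled for a different target (e.g. built with
// -fsycl-targets=nvptx64-nvidia-cuda but run on an OpenCL CPU device),
// not only when a required aspect is missing.
#include <sycl/sycl.hpp>

int main() {
  sycl::device dev;
  if (!sycl::is_compatible(dev))
    return 1;  // no usable device image for this device
  sycl::queue q(dev);
  q.parallel_for(sycl::range<1>{1}, [=](sycl::id<1> id) { (void)id; }).wait();
  return 0;
}
```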
Related spec change: https://github.com/KhronosGroup/SYCL-Docs/pull/381 Resolves #7561 --- sycl/source/kernel_bundle.cpp | 60 +++++++++++++++++-- .../Inputs/is_compatible_with_env.cpp | 14 +++++ .../is_compatible/is_compatible_amdgcn.cpp | 7 +++ .../is_compatible_esimd_emulator.cpp | 41 +++++++++++++ .../is_compatible/is_compatible_nvptx64.cpp | 7 +++ .../is_compatible_several_targets.cpp | 8 +++ .../is_compatible/is_compatible_spir64.cpp | 7 +++ .../is_compatible_spir64_fpga.cpp | 7 +++ .../is_compatible_spir64_gen.cpp | 7 +++ .../is_compatible_spir64_x86_64.cpp | 7 +++ .../is_compatible_with_aspects.cpp} | 0 11 files changed, 159 insertions(+), 6 deletions(-) create mode 100644 sycl/test-e2e/OptionalKernelFeatures/is_compatible/Inputs/is_compatible_with_env.cpp create mode 100644 sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp create mode 100644 sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_esimd_emulator.cpp create mode 100644 sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_nvptx64.cpp create mode 100644 sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp create mode 100644 sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp create mode 100644 sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp create mode 100644 sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp create mode 100644 sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp rename sycl/test-e2e/OptionalKernelFeatures/{is_compatible.cpp => is_compatible/is_compatible_with_aspects.cpp} (100%) diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index e124f0316932a..44b172c60d12c 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include #include @@ -290,12 +291,59 @@ std::vector get_kernel_ids() { } bool is_compatible(const std::vector &KernelIDs, const device &Dev) { - std::set BinImages = - detail::ProgramManager::getInstance().getRawDeviceImages(KernelIDs); - return std::all_of(BinImages.begin(), BinImages.end(), - [&Dev](const detail::RTDeviceBinaryImage *Img) { - return doesDevSupportDeviceRequirements(Dev, *Img); - }); + if (KernelIDs.empty()) + return false; + // TODO: also need to check that the architecture specified by the + // "-fsycl-targets" flag matches the device when we are able to get the + // device's arch. + auto doesImageTargetMatchDevice = [](const device &Dev, + const detail::RTDeviceBinaryImage &Img) { + const char *Target = Img.getRawData().DeviceTargetSpec; + auto BE = Dev.get_backend(); + // ESIMD emulator backend is only compatible with esimd kernels. 
+ if (BE == sycl::backend::ext_intel_esimd_emulator) { + pi_device_binary_property Prop = Img.getProperty("isEsimdImage"); + return (Prop && (detail::DeviceBinaryProperty(Prop).asUint32() != 0)); + } + if (strcmp(Target, __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64) == 0) { + return (BE == sycl::backend::opencl || + BE == sycl::backend::ext_oneapi_level_zero); + } else if (strcmp(Target, __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_X86_64) == + 0) { + return Dev.is_cpu(); + } else if (strcmp(Target, __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_GEN) == + 0) { + return Dev.is_gpu() && (BE == sycl::backend::opencl || + BE == sycl::backend::ext_oneapi_level_zero); + } else if (strcmp(Target, __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64_FPGA) == + 0) { + return Dev.is_accelerator(); + } else if (strcmp(Target, __SYCL_PI_DEVICE_BINARY_TARGET_NVPTX64) == 0) { + return BE == sycl::backend::ext_oneapi_cuda; + } else if (strcmp(Target, __SYCL_PI_DEVICE_BINARY_TARGET_AMDGCN) == 0) { + return BE == sycl::backend::ext_oneapi_hip; + } + + return false; + }; + + // One kernel may be contained in several binary images depending on the + // number of targets. This kernel is compatible with the device if there is + // at least one image (containing this kernel) whose aspects are supported by + // the device and whose target matches the device. + for (const auto &KernelID : KernelIDs) { + std::set BinImages = + detail::ProgramManager::getInstance().getRawDeviceImages({KernelID}); + + if (std::none_of(BinImages.begin(), BinImages.end(), + [&](const detail::RTDeviceBinaryImage *Img) { + return doesDevSupportDeviceRequirements(Dev, *Img) && + doesImageTargetMatchDevice(Dev, *Img); + })) + return false; + } + + return true; } } // __SYCL_INLINE_VER_NAMESPACE(_V1) diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/Inputs/is_compatible_with_env.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/Inputs/is_compatible_with_env.cpp new file mode 100644 index 0000000000000..e919a6a3bf001 --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/Inputs/is_compatible_with_env.cpp @@ -0,0 +1,14 @@ +#include + +int main() { + sycl::device dev; + if (sycl::is_compatible(dev)) { + sycl::queue q(dev); + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::range<1>{1}, + [=](sycl::id<1> Id) { int x = Id[0]; }); + }).wait_and_throw(); + return 0; + } + return 1; +} diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp new file mode 100644 index 0000000000000..5e240044483b7 --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp @@ -0,0 +1,7 @@ +// REQUIRES: hip_amd, opencl, gpu, cpu + +// RUN: %clangxx -fsycl -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx906 -fsycl-targets=amdgcn-amd-amdhsa %S/Inputs/is_compatible_with_env.cpp -o %t.out + +// RUN: env ONEAPI_DEVICE_SELECTOR=hip:gpu %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run} not %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run} not %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_esimd_emulator.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_esimd_emulator.cpp new file mode 100644 index 0000000000000..1042b983167cc --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_esimd_emulator.cpp @@ -0,0 +1,41 @@ +// REQUIRES: esimd_emulator + +// RUN: %clangxx -fsycl 
%S/Inputs/is_compatible_with_env.cpp %t_negative_case.out +// RUN: env ONEAPI_DEVICE_SELECTOR=ext_intel_esimd_emulator:gpu %{run} not %t_negative_case.out + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Just an example from +// https://github.com/intel/llvm/tree/sycl/sycl/doc/extensions/experimental/sycl_ext_intel_esimd + +#include +#include + +int main() { + sycl::device dev; + if (sycl::is_compatible(dev)) { + float *A = malloc_shared(Size, q); + float *B = malloc_shared(Size, q); + float *C = malloc_shared(Size, q); + + for (unsigned i = 0; i != Size; i++) { + A[i] = B[i] = i; + } + + q.submit([&](handler &cgh) { + cgh.parallel_for(Size / VL, + [=](id<1> i) [[intel::sycl_explicit_simd]] { + auto offset = i * VL; + // pointer arithmetic, so offset is in + // elements: + simd va(A + offset); + simd vb(B + offset); + simd vc = va + vb; + vc.copy_to(C + offset); + }); + }).wait_and_throw(); + return 0; + } + return 1; +} diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_nvptx64.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_nvptx64.cpp new file mode 100644 index 0000000000000..ccfa829293c3f --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_nvptx64.cpp @@ -0,0 +1,7 @@ +// REQUIRES: cuda, opencl, gpu, cpu + +// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda %S/Inputs/is_compatible_with_env.cpp -o %t.out + +// RUN: env ONEAPI_DEVICE_SELECTOR=cuda:gpu %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run} not %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run} not %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp new file mode 100644 index 0000000000000..6dcc4690880d6 --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp @@ -0,0 +1,8 @@ +// REQUIRES: ocloc, level_zero, gpu, cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_fpga,spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/is_compatible_with_env.cpp -o %t.out + +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run} not %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:acc %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=level_zero:gpu %{run} %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp new file mode 100644 index 0000000000000..465a79056906a --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp @@ -0,0 +1,7 @@ +// REQUIRES: cuda, opencl, gpu, cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64 %S/Inputs/is_compatible_with_env.cpp -o %t.out + +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=cuda:gpu %{run} not %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp new file mode 100644 index 0000000000000..57366482e7082 --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp @@ -0,0 +1,7 @@ +// REQUIRES: opencl-aot, accelerator, gpu, cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_fpga 
%S/Inputs/is_compatible_with_env.cpp -o %t.out + +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:fpga %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=*:gpu %{run} not %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run} not %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp new file mode 100644 index 0000000000000..5adb27e0ae697 --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp @@ -0,0 +1,7 @@ +// REQUIRES: ocloc, gpu, level_zero, cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/is_compatible_with_env.cpp -o %t.out + +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=level_zero:gpu %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run} not %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp new file mode 100644 index 0000000000000..0a6f2c39df8af --- /dev/null +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp @@ -0,0 +1,7 @@ +// REQUIRES: opencl-aot, cpu, gpu, level_zero + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 %S/Inputs/is_compatible_with_env.cpp -o %t.out + +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run} not %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=level_zero:gpu %{run} not %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp similarity index 100% rename from sycl/test-e2e/OptionalKernelFeatures/is_compatible.cpp rename to sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp From c9219ce7232da9bc10d50dff8a1858540b292df5 Mon Sep 17 00:00:00 2001 From: Alastair Murray Date: Thu, 15 Jun 2023 00:29:00 +0100 Subject: [PATCH 33/55] [SYCL][UR][CUDA] Update CODEOWNERS for Unified Runtime CUDA Adapter (#9883) Mark newly ported UR CUDA plugin as owned by CUDA reviewer group --------- Co-authored-by: Alexey Bader --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f40685f18f7f7..23028a406360f 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -44,7 +44,7 @@ sycl/plugins/unified_runtime/ @intel/dpcpp-l0-pi-reviewers sycl/plugins/esimd_emulator/ @intel/dpcpp-esimd-reviewers # CUDA plugin -sycl/plugins/cuda/ @intel/llvm-reviewers-cuda +sycl/plugins/**/cuda/ @intel/llvm-reviewers-cuda # XPTI instrumentation utilities xpti/ @intel/llvm-reviewers-runtime From 062446584d52e5dc80a94f7bacf02037b43dd5d2 Mon Sep 17 00:00:00 2001 From: Srividya Sundaram Date: Thu, 15 Jun 2023 02:58:51 -0700 Subject: [PATCH 34/55] [SYCL][Driver] Emit an error when PCH is triggered in SYCL mode. (#9689) Emit an error if PCH(Pre-Compiled Header) file generation is forced in -fsycl mode. 
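For reference, these are the kinds of invocations that now hit the diagnostic, mirroring the new `clang/test/Driver/pch-fsycl-error.cpp` test; the file names are placeholders and the surrounding driver output framing is elided:

```sh
# Sketch only: commands that force PCH generation together with -fsycl.
clang++ -c -fsycl -x c++-header header.h
#   Precompiled header generation is not supported with '-fsycl'
clang-cl /c -fsycl /Ycpchfile.h /FIpchfile.h test.cpp
#   Precompiled header generation is not supported with '-fsycl'
```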
--------- Co-authored-by: premanandrao --- .../clang/Basic/DiagnosticDriverKinds.td | 2 ++ clang/lib/Driver/Driver.cpp | 5 +++++ clang/test/Driver/pch-fsycl-error.cpp | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+) create mode 100644 clang/test/Driver/pch-fsycl-error.cpp diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 8f65523effeb5..b4ed7dfd6827b 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -360,6 +360,8 @@ def err_drv_expecting_fsycl_with_sycl_opt : Error< "'%0' must be used in conjunction with '-fsycl' to enable offloading">; def err_drv_fsycl_with_c_type : Error< "'%0' must not be used in conjunction with '-fsycl', which expects C++ source">; +def err_drv_fsycl_with_pch : Error< + "Precompiled header generation is not supported with '-fsycl'">; def err_drv_fsycl_unsupported_with_opt : Error<"'%0' is not supported with '-fsycl'">; def err_drv_sycl_missing_amdgpu_arch : Error< diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 2c4e2552c615d..159035a887cb1 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -9252,6 +9252,11 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, } } + // Emit an error if PCH(Pre-Compiled Header) file generation is forced in + // -fsycl mode. + if (C.getArgs().hasFlag(options::OPT_fsycl, options::OPT_fno_sycl, false) && + JA.getType() == types::TY_PCH) + Diag(clang::diag::err_drv_fsycl_with_pch); // As an annoying special case, PCH generation doesn't strip the pathname. if (JA.getType() == types::TY_PCH && !IsCLMode()) { llvm::sys::path::remove_filename(BasePath); diff --git a/clang/test/Driver/pch-fsycl-error.cpp b/clang/test/Driver/pch-fsycl-error.cpp new file mode 100644 index 0000000000000..824e19c03738d --- /dev/null +++ b/clang/test/Driver/pch-fsycl-error.cpp @@ -0,0 +1,19 @@ +// This test checks that an error is emitted when +// PCH(Precompiled Header) file generation is forced in -fsycl mode. + +// RUN: touch %t.h + +// Linux +// RUN: %clang -c -fsycl -x c++-header %t.h -### %s 2> %t1.txt +// RUN: FileCheck %s -input-file=%t1.txt +// CHECK: Precompiled header generation is not supported with '-fsycl' + +// Windows +// RUN: %clang_cl -c -fsycl -x c++-header %t.h -### -- %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-ERROR %s +// CHECK-ERROR: Precompiled header generation is not supported with '-fsycl' + +// /Yc +// RUN: %clang_cl -fsycl /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-YC %s +// CHECK-YC: Precompiled header generation is not supported with '-fsycl' From 6166f8eecbf6fb985dd95f9b9608846e3b419847 Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Thu, 15 Jun 2023 15:29:31 +0200 Subject: [PATCH 35/55] [NFC][clang] Fix -Wreorder in clang::PrinitingPolicy (#9904) --- clang/include/clang/AST/PrettyPrinter.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/AST/PrettyPrinter.h b/clang/include/clang/AST/PrettyPrinter.h index e0015447fd810..0a20f2a9c4864 100644 --- a/clang/include/clang/AST/PrettyPrinter.h +++ b/clang/include/clang/AST/PrettyPrinter.h @@ -212,6 +212,9 @@ struct PrintingPolicy { /// \endcode unsigned SuppressTypedefs : 1; + /// When true, suppress printing final specifier. + unsigned SuppressFinalSpecifier : 1; + /// When true, suppresses printing template arguments in names of C++ /// constructors. 
unsigned SuppressTemplateArgsInCXXConstructors : 1; @@ -220,9 +223,6 @@ struct PrintingPolicy { /// argument for the parameter. unsigned SuppressDefaultTemplateArgs : 1; - /// When true, suppress printing final specifier. - unsigned SuppressFinalSpecifier : 1; - /// Whether we can use 'bool' rather than '_Bool' (even if the language /// doesn't actually have 'bool', because, e.g., it is defined as a macro). unsigned Bool : 1; From 23fbba1e6b8b2ef9ee431851f447f6b1470021d4 Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Thu, 15 Jun 2023 16:56:42 +0200 Subject: [PATCH 36/55] [SYCL] Small fix for sycl::is_compatible() (#9903) Should return true if KernelIDs is empty. --- sycl/source/kernel_bundle.cpp | 2 +- .../is_compatible/is_compatible_with_aspects.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index 44b172c60d12c..2cbb432ad2318 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -292,7 +292,7 @@ std::vector get_kernel_ids() { bool is_compatible(const std::vector &KernelIDs, const device &Dev) { if (KernelIDs.empty()) - return false; + return true; // TODO: also need to check that the architecture specified by the // "-fsycl-targets" flag matches the device when we are able to get the // device's arch. diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp index f2e57f5a6da4e..de0900b762e79 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_with_aspects.cpp @@ -24,6 +24,9 @@ int main() { sycl::device Dev; sycl::queue Q(Dev); + // Returns true for empty vector of kernels + assert(sycl::is_compatible({}, Dev)); + if (sycl::is_compatible(Dev)) { Q.submit( [&](sycl::handler &h) { h.single_task([=]() { foo(); }); }); From 9cab5598bc824d45d058004cb1b2e28b0121c076 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Thu, 15 Jun 2023 08:25:06 -0700 Subject: [PATCH 37/55] [SYCL][DOC] Add sycl_ext_oneapi_address_cast (#9812) Splits sycl::address_space_cast into static and dynamic casts, allowing developers to avoid the overhead of address space checks at runtime. --------- Signed-off-by: John Pennycook --- .../sycl_ext_oneapi_address_cast.asciidoc | 204 ++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100755 sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc new file mode 100755 index 0000000000000..87c48e745a11c --- /dev/null +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc @@ -0,0 +1,204 @@ += sycl_ext_oneapi_address_cast + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2023-2023 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. 
OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 7 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + + +== Status + +This is a proposed extension specification, intended to gather community +feedback. Interfaces defined in this specification may not be implemented yet +or may be in a preliminary state. The specification itself may also change in +incompatible ways before it is finalized. *Shipping software products should +not rely on APIs defined in this specification.* + + +== Overview + +The `sycl::address_space_cast` function in SYCL 2020 does two things: 1) checks +whether a given raw pointer can be cast to a specific address space; and 2) +performs the casting operation. In cases where the developer is attempting to +assert that a raw pointer points to an object in a specific address space, the +checks from 1) are not required and may have undesirable performance impact. + +This extension separates `sycl::address_space_cast` into two functions: + +- `static_address_cast`, which casts with no run-time checks. +- `dynamic_address_cast`, which casts with run-time checks. + + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_ADDRESS_CAST` to one of the values defined in the +table below. Applications can test for the existence of this macro to +determine if the implementation supports this feature, or applications can test +the macro's value to determine which of the extension's features the +implementation supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + + +=== Address space cast functions + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + +template +multi_ptr +static_address_cast(ElementType* ptr); + +template +multi_ptr +dynamic_address_cast(ElementType* ptr); + +} // namespace sycl::ext::oneapi::experimental +---- + +[source,c++] +---- +template +multi_ptr +static_address_cast(ElementType* ptr); +---- +_Preconditions_: `ptr` points to an object allocated in the address space +designated by `Space`. + +_Returns_: A `multi_ptr` with the specified address space and decoration that +points to the same object as `ptr`. + +[NOTE] +==== +Implementations may choose to issue a diagnostic if they can prove that `ptr` +does not point to an object allocated in the address space designated by +`Space`. +==== + + +[source,c++] +---- +template +multi_ptr +dynamic_address_cast(ElementType* ptr); +---- +_Preconditions_: The memory at `ptr` is accessible to the calling work-item. + +_Returns_: A `multi_ptr` with the specified address space and decoration that +points to the same object as `ptr` if `ptr` points to an object allocated in +the address space designated by `Space`, and `nullptr` otherwise. + +[NOTE] +==== +The precondition prevents `dynamic_address_cast` from being used to +reason about the address space of pointers originating from another work-item +(in the case of `private` pointers) or another work-group (in the case of +`local` pointers). 
Such pointers could not be dereferenced by the calling +work-item, and it is thus unclear that being able to reason about the address +space would be useful. Limiting the use of `dynamic_address_cast` to +accessible pointers is expected to result in simpler and faster +implementations. +==== + + +== Implementation notes + +For SPIR-V backends, `static_address_cast` corresponds to +`OpGenericCastToPtr`. `dynamic_address_cast` _may_ correspond to +`OpGenericCastToPtrExplicit` -- there is currently some ambiguity regarding +exactly how `OpGenericCastToPtrExplicit` is expected to behave, because the +SPIR-V specification does not explain what it means for a cast to "fail". +Since this extension is only experimental, we can likely implement +`dynamic_address_cast` using `OpGenericCastToPtrExplicit` while we +seek to clarify the SPIR-V specification. + +Generally speaking, it is expected that a `static_address_cast` can +simply attach new decoration(s) to the raw pointer (or do nothing), while +a `dynamic_address_cast` will have to inspect the address of the +raw pointer to determine which region of memory it points to. + +An implementation for a CPU target could be implemented by keeping track of +three pieces of information in thread-local storage: + +- The base (highest address) of the calling thread's stack. +- The low bound of the calling work-item's local memory area. +- The high bound of the calling work-item's local memory area. + +A cast to `private_space` succeeds as long as the pointer is within the calling +thread's stack. A cast to `local_space` succeeds as long as the pointer is +within the calling work-item's local memory area. A cast to `global_space` +succeeds as long as the pointer is not within either of the above two address +ranges. + +Implementations for GPU targets may be able to leverage dedicated instructions +for checking the address space. + + +== Issues + +. Some developers may expect a `dynamic_address_cast` to succeed if the +pointer continues to work, irrespective of where the object the pointer points +to was allocated. For example, some CPU implementations may treat global and +local pointers equivalently in many situations. ++ +-- +*UNRESOLVED*: +The current description of `dynamic_address_cast` requires +implementations to track precisely which address space a pointer is associated +with, in order to ensure that using the result of a dynamic cast is always +safe. If we can identify use-cases for the more relaxed behavior, it would +make sense to introduce either a third type of cast or some global check that +two address spaces use the same representation and are thus "compatible". 
+-- From 5069cc42950f0a619b004612f3ac91f8cefed6d4 Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Thu, 15 Jun 2023 08:31:19 -0700 Subject: [PATCH 38/55] [ESIMD][E2E] Limit execution of the test bfloat16Constructor.cpp to DG2 and PVC (#9893) --- sycl/test-e2e/ESIMD/regression/bfloat16Constructor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/ESIMD/regression/bfloat16Constructor.cpp b/sycl/test-e2e/ESIMD/regression/bfloat16Constructor.cpp index 33242b599dcf9..1774afcbd55da 100644 --- a/sycl/test-e2e/ESIMD/regression/bfloat16Constructor.cpp +++ b/sycl/test-e2e/ESIMD/regression/bfloat16Constructor.cpp @@ -1,7 +1,7 @@ -// UNSUPPORTED: gpu-intel-gen9 +// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// XFAIL: gpu && !esimd_emulator +// XFAIL: windows && !esimd_emulator //==- bfloat16Constructor.cpp - Test to verify use of bfloat16 constructor -==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. From 8b85b6400b8271cf3bcf8e91fe1cb81a2665e23a Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Thu, 15 Jun 2023 11:35:17 -0400 Subject: [PATCH 39/55] [SYCL][ESIMD][Driver] Pass -fsycl-esimd-force-stateless-mem to host (#9825) Right now, the `-fsycl-esimd-force-stateless-mem` flag sets the `__ESIMD_FORCE_STATELESS_MEM` macro for the device compiler only. We have some APIs that have different arguments with `__ESIMD_FORCE_STATELESS_MEM` vs without it, so getting the host compiler to not error when calling one of those functions can be frustrating for the user. Make `-fsycl-esimd-force-stateless-mem` set `__ESIMD_FORCE_STATELESS_MEM` for the host compiler too. I also found we had some esimd_emulator tests that were testing stateless mode, but the emulator never uses the device code and always uses host code, so it wasn't even testing stateless mode. We discussed this internally and decided to disable the tests on the emulator. --------- Signed-off-by: Sarnie, Nick --- clang/lib/Driver/ToolChains/Clang.cpp | 7 +++---- clang/lib/Frontend/InitPreprocessor.cpp | 6 +++--- clang/test/Driver/sycl-esimd-force-stateless-mem.cpp | 6 +++--- sycl/test-e2e/ESIMD/acc_gather_scatter_rgba_stateless.cpp | 1 + sycl/test-e2e/ESIMD/accessor_gather_scatter_stateless.cpp | 1 + sycl/test-e2e/ESIMD/accessor_load_store_stateless.cpp | 1 + sycl/test-e2e/ESIMD/accessor_stateless.cpp | 1 + sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp | 3 ++- sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u32_stateless.cpp | 5 +++-- sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u64_stateless.cpp | 5 +++-- sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u8_u16_stateless.cpp | 5 +++-- sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u32_stateless.cpp | 5 +++-- sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u64_stateless.cpp | 5 +++-- 13 files changed, 30 insertions(+), 21 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 1cab8ab2ee7c6..bbed9e49ec102 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5129,10 +5129,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fsycl-allow-func-ptr"); } - if (Args.hasFlag(options::OPT_fsycl_esimd_force_stateless_mem, - options::OPT_fno_sycl_esimd_force_stateless_mem, false)) - CmdArgs.push_back("-fsycl-esimd-force-stateless-mem"); - // Forward -fsycl-instrument-device-code option to cc1. This option will // only be used for SPIR-V-based targets. 
if (Triple.isSPIR()) @@ -5318,6 +5314,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, for (auto &Macro : D.getSYCLTargetMacroArgs()) CmdArgs.push_back(Args.MakeArgString(Macro)); } + if (Args.hasFlag(options::OPT_fsycl_esimd_force_stateless_mem, + options::OPT_fno_sycl_esimd_force_stateless_mem, false)) + CmdArgs.push_back("-fsycl-esimd-force-stateless-mem"); } if (IsOpenMPDevice) { diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index b785c966a18bd..6a8b05ae06ff6 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1309,13 +1309,13 @@ static void InitializePredefinedMacros(const TargetInfo &TI, Builder.defineMacro("__ENABLE_USM_ADDR_SPACE__"); Builder.defineMacro("SYCL_DISABLE_FALLBACK_ASSERT"); } - - if (LangOpts.SYCLESIMDForceStatelessMem) - Builder.defineMacro("__ESIMD_FORCE_STATELESS_MEM"); } if (LangOpts.SYCLUnnamedLambda) Builder.defineMacro("__SYCL_UNNAMED_LAMBDA__"); + if (LangOpts.SYCLESIMDForceStatelessMem) + Builder.defineMacro("__ESIMD_FORCE_STATELESS_MEM"); + // OpenCL definitions. if (LangOpts.OpenCL) { InitializeOpenCLFeatureTestMacros(TI, LangOpts, Builder); diff --git a/clang/test/Driver/sycl-esimd-force-stateless-mem.cpp b/clang/test/Driver/sycl-esimd-force-stateless-mem.cpp index 048aa7b3fa73a..90e506a86f53c 100644 --- a/clang/test/Driver/sycl-esimd-force-stateless-mem.cpp +++ b/clang/test/Driver/sycl-esimd-force-stateless-mem.cpp @@ -1,12 +1,12 @@ /// Verify that the driver option is translated to corresponding options -/// to device compilation and sycl-post-link. +/// to host/device compilation and sycl-post-link. // RUN: %clang -### -fsycl -fsycl-esimd-force-stateless-mem \ // RUN: %s 2>&1 | FileCheck -check-prefix=CHECK-PASS-TO-COMPS %s // CHECK-PASS-TO-COMPS: clang{{.*}} "-fsycl-esimd-force-stateless-mem" // CHECK-PASS-TO-COMPS: sycl-post-link{{.*}} "-lower-esimd-force-stateless-mem" -// CHECK-PASS-TO-COMPS-NOT: clang{{.*}} "-fsycl-is-host" {{.*}}"-fsycl-esimd-force-stateless-mem" -// CHECK-PASS-TO-COMPS-NOT: clang{{.*}} "-fsycl-esimd-force-stateless-mem" {{.*}}"-fsycl-is-host" +// CHECK-PASS-TO-COMPS: clang{{.*}} "-fsycl-is-host" {{.*}}"-fsycl-esimd-force-stateless-mem" +" /// Verify that stateless memory accesses mapping is not enforced by default // RUN: %clang -### -fsycl %s 2>&1 | FileCheck -check-prefix=CHECK-DEFAULT %s diff --git a/sycl/test-e2e/ESIMD/acc_gather_scatter_rgba_stateless.cpp b/sycl/test-e2e/ESIMD/acc_gather_scatter_rgba_stateless.cpp index de12f93d51e66..0f6a9dc846cd0 100644 --- a/sycl/test-e2e/ESIMD/acc_gather_scatter_rgba_stateless.cpp +++ b/sycl/test-e2e/ESIMD/acc_gather_scatter_rgba_stateless.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// UNSUPPORTED: esimd_emulator // Use -O2 to avoid huge stack usage under -O0. 
// RUN: %{build} -O2 -fsycl-esimd-force-stateless-mem -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/ESIMD/accessor_gather_scatter_stateless.cpp b/sycl/test-e2e/ESIMD/accessor_gather_scatter_stateless.cpp index bc5a35d9b26d6..c893fb0c22000 100644 --- a/sycl/test-e2e/ESIMD/accessor_gather_scatter_stateless.cpp +++ b/sycl/test-e2e/ESIMD/accessor_gather_scatter_stateless.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// UNSUPPORTED: esimd_emulator // Use -O2 to avoid huge stack usage under -O0. // RUN: %{build} -O2 -fsycl-esimd-force-stateless-mem -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/ESIMD/accessor_load_store_stateless.cpp b/sycl/test-e2e/ESIMD/accessor_load_store_stateless.cpp index 589af386c4ef0..0495ea2fd327a 100644 --- a/sycl/test-e2e/ESIMD/accessor_load_store_stateless.cpp +++ b/sycl/test-e2e/ESIMD/accessor_load_store_stateless.cpp @@ -10,6 +10,7 @@ // intrinsics when stateless memory accesses are enforced, i.e. accessor // based accesses are automatically converted to stateless accesses. +// UNSUPPORTED: esimd_emulator // RUN: %{build} -fsycl-esimd-force-stateless-mem -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/ESIMD/accessor_stateless.cpp b/sycl/test-e2e/ESIMD/accessor_stateless.cpp index aa3384bf0117f..78bc7a34925a8 100644 --- a/sycl/test-e2e/ESIMD/accessor_stateless.cpp +++ b/sycl/test-e2e/ESIMD/accessor_stateless.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// UNSUPPORTED: esimd_emulator // Use -O2 to avoid huge stack usage under -O0. // RUN: %{build} -O2 -fsycl-esimd-force-stateless-mem -D_CRT_SECURE_NO_WARNINGS=1 -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp index 35793cf670a3d..bb13127c9dda5 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_predicate_stateless.cpp @@ -5,7 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || esimd_emulator +// UNSUPPORTED: esimd_emulator +// REQUIRES: gpu-intel-pvc // RUN: %{build} -fsycl-esimd-force-stateless-mem -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u32_stateless.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u32_stateless.cpp index dc69914ed61c0..97d4d87b074ee 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u32_stateless.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u32_stateless.cpp @@ -5,8 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || esimd_emulator +// UNSUPPORTED: esimd_emulator +// REQUIRES: gpu-intel-pvc // RUN: %{build} -fsycl-esimd-force-stateless-mem -o %t.out // RUN: %{run} %t.out -#include "lsc_surf_load_u32.cpp" \ No newline at end of file +#include "lsc_surf_load_u32.cpp" diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u64_stateless.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u64_stateless.cpp index f4f6e9044c939..53ce8cd28671f 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u64_stateless.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u64_stateless.cpp @@ -5,8 +5,9 @@ // 
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || esimd_emulator +// UNSUPPORTED: esimd_emulator +// REQUIRES: gpu-intel-pvc // RUN: %{build} -fsycl-esimd-force-stateless-mem -o %t.out // RUN: %{run} %t.out -#include "lsc_surf_load_u64.cpp" \ No newline at end of file +#include "lsc_surf_load_u64.cpp" diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u8_u16_stateless.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u8_u16_stateless.cpp index 4a138d8428d9a..c459f11a1a97f 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u8_u16_stateless.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_load_u8_u16_stateless.cpp @@ -5,8 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || esimd_emulator +// UNSUPPORTED: esimd_emulator +// REQUIRES: gpu-intel-pvc // RUN: %{build} -fsycl-esimd-force-stateless-mem -o %t.out // RUN: %{run} %t.out -#include "lsc_surf_load_u8_u16.cpp" \ No newline at end of file +#include "lsc_surf_load_u8_u16.cpp" diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u32_stateless.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u32_stateless.cpp index 1a0cafc3b443f..e85c5adff9ec7 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u32_stateless.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u32_stateless.cpp @@ -5,8 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || esimd_emulator +// UNSUPPORTED: esimd_emulator +// REQUIRES: gpu-intel-pvc // RUN: %{build} -fsycl-esimd-force-stateless-mem -o %t.out // RUN: %{run} %t.out -#include "lsc_surf_store_u32.cpp" \ No newline at end of file +#include "lsc_surf_store_u32.cpp" diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u64_stateless.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u64_stateless.cpp index 27cd03868b7cc..3c0262be43147 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u64_stateless.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_surf_store_u64_stateless.cpp @@ -5,8 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || esimd_emulator +// UNSUPPORTED: esimd_emulator +// REQUIRES: gpu-intel-pvc // RUN: %{build} -fsycl-esimd-force-stateless-mem -o %t.out // RUN: %{run} %t.out -#include "lsc_surf_store_u64.cpp" \ No newline at end of file +#include "lsc_surf_store_u64.cpp" From 52a0fc8ed65d31b403fa68e24d4db8e77685171f Mon Sep 17 00:00:00 2001 From: Buildbot for SYCL Date: Thu, 15 Jun 2023 23:36:43 +0800 Subject: [PATCH 40/55] [GHA] Uplift Linux GPU RT version to 23.09.25812.14 (#9196) Scheduled drivers uplift --------- Co-authored-by: GitHub Actions Co-authored-by: Alexey Bader --- devops/dependencies.json | 20 ++++++++++---------- sycl/test-e2e/Printf/char.cpp | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/devops/dependencies.json b/devops/dependencies.json index 448c7ff5004a1..5c5f4de96f5f5 100644 --- a/devops/dependencies.json +++ b/devops/dependencies.json @@ -1,15 +1,15 @@ { "linux": { "compute_runtime": { - "github_tag": "22.43.24595.30", - "version": "22.43.24595.30", - "url": "https://github.com/intel/compute-runtime/releases/tag/22.43.24595.30", + "github_tag": "23.09.25812.14", + "version": "23.09.25812.14", + 
"url": "https://github.com/intel/compute-runtime/releases/tag/23.09.25812.14", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "igc": { - "github_tag": "igc-1.0.12504.5", - "version": "1.0.12504.5", - "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/igc-1.0.12504.5", + "github_tag": "igc-1.0.13463.18", + "version": "1.0.13463.18", + "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/igc-1.0.13463.18", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "cm": { @@ -19,9 +19,9 @@ "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "level_zero": { - "github_tag": "v1.8.12", - "version": "v1.8.12", - "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.8.12", + "github_tag": "v1.10.0", + "version": "v1.10.0", + "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.10.0", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "tbb": { @@ -74,4 +74,4 @@ "root": "{ARCHIVE_ROOT}/comp/oclfpga/win" } } -} +} \ No newline at end of file diff --git a/sycl/test-e2e/Printf/char.cpp b/sycl/test-e2e/Printf/char.cpp index ae222e6128dea..57c47a9349eef 100644 --- a/sycl/test-e2e/Printf/char.cpp +++ b/sycl/test-e2e/Printf/char.cpp @@ -5,7 +5,7 @@ // [1]: https://en.cppreference.com/w/cpp/io/c/fprintf // // UNSUPPORTED: hip_amd -// XFAIL: cuda && windows +// XFAIL: cuda && windows || ((level_zero || opencl) && gpu && linux) // // RUN: %{build} -o %t.out // RUN: %{run} %t.out | FileCheck %s From bb9055da7c10dd3e3418f08c0bc71f56ff8f0a57 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 15 Jun 2023 19:35:08 +0100 Subject: [PATCH 41/55] [SYCL][CUDA] Fix some bfloat16 math to work for cc < sm_80 (#9900) When we switched to bfloat16 being supported for all devices a few math functions were not supported for sm_xx Co-authored-by: JackAKirk --- libclc/ptx-nvidiacl/libspirv/math/fabs.cl | 12 +--- libclc/ptx-nvidiacl/libspirv/math/fma.cl | 10 +--- libclc/ptx-nvidiacl/libspirv/math/fmax.cl | 10 +--- libclc/ptx-nvidiacl/libspirv/math/fmin.cl | 10 +--- .../ext/oneapi/experimental/bfloat16_math.hpp | 56 ++++++++++++------- 5 files changed, 44 insertions(+), 54 deletions(-) diff --git a/libclc/ptx-nvidiacl/libspirv/math/fabs.cl b/libclc/ptx-nvidiacl/libspirv/math/fabs.cl index 0aac0fa4ab0f0..4f12e85310a01 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/fabs.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/fabs.cl @@ -11,27 +11,19 @@ #include "../../include/libdevice.h" #include -extern int __clc_nvvm_reflect_arch(); - #define __CLC_FUNCTION __spirv_ocl_fabs #define __CLC_BUILTIN __nv_fabs #define __CLC_BUILTIN_F __CLC_XCONCAT(__CLC_BUILTIN, f) #include +// Requires at least sm_80 _CLC_DEF _CLC_OVERLOAD ushort __clc_fabs(ushort x) { - if (__clc_nvvm_reflect_arch() >= 800) { return __nvvm_abs_bf16(x); - } - __builtin_trap(); - __builtin_unreachable(); } _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_fabs, ushort) +// Requires at least sm_80 _CLC_DEF _CLC_OVERLOAD uint __clc_fabs(uint x) { - if (__clc_nvvm_reflect_arch() >= 800) { return __nvvm_abs_bf16x2(x); - } - __builtin_trap(); - __builtin_unreachable(); } _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_fabs, uint) diff --git a/libclc/ptx-nvidiacl/libspirv/math/fma.cl b/libclc/ptx-nvidiacl/libspirv/math/fma.cl index 4cfdc2821e12e..738951681e66e 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/fma.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/fma.cl @@ -48,22 +48,16 @@ _CLC_TERNARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_fma, #endif +// Requires at least sm_80 _CLC_DEF 
_CLC_OVERLOAD ushort __clc_fma(ushort x, ushort y, ushort z) { - if (__clc_nvvm_reflect_arch() >= 800) { return __nvvm_fma_rn_bf16(x, y, z); - } - __builtin_trap(); - __builtin_unreachable(); } _CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_fma, ushort, ushort, ushort) +// Requires at least sm_80 _CLC_DEF _CLC_OVERLOAD uint __clc_fma(uint x, uint y, uint z) { - if (__clc_nvvm_reflect_arch() >= 800) { return __nvvm_fma_rn_bf16x2(x, y, z); - } - __builtin_trap(); - __builtin_unreachable(); } _CLC_TERNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_fma, uint, uint, uint) diff --git a/libclc/ptx-nvidiacl/libspirv/math/fmax.cl b/libclc/ptx-nvidiacl/libspirv/math/fmax.cl index 645762000ff53..d9dac6e752513 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/fmax.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/fmax.cl @@ -51,22 +51,16 @@ _CLC_BINARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_fmax, #endif +// Requires at least sm_80 _CLC_DEF _CLC_OVERLOAD ushort __clc_fmax(ushort x, ushort y) { - if (__clc_nvvm_reflect_arch() >= 800) { return __nvvm_fmax_bf16(x, y); - } - __builtin_trap(); - __builtin_unreachable(); } _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_fmax, ushort, ushort) +// Requires at least sm_80 _CLC_DEF _CLC_OVERLOAD uint __clc_fmax(uint x, uint y) { - if (__clc_nvvm_reflect_arch() >= 800) { return __nvvm_fmax_bf16x2(x, y); - } - __builtin_trap(); - __builtin_unreachable(); } _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_fmax, uint, uint) diff --git a/libclc/ptx-nvidiacl/libspirv/math/fmin.cl b/libclc/ptx-nvidiacl/libspirv/math/fmin.cl index 6bdc4b8176be5..167e65cdc5ec8 100644 --- a/libclc/ptx-nvidiacl/libspirv/math/fmin.cl +++ b/libclc/ptx-nvidiacl/libspirv/math/fmin.cl @@ -51,22 +51,16 @@ _CLC_BINARY_VECTORIZE_HAVE2(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_fmin, half #endif +// Requires at least sm_80 _CLC_DEF _CLC_OVERLOAD ushort __clc_fmin(ushort x, ushort y) { - if (__clc_nvvm_reflect_arch() >= 800) { return __nvvm_fmin_bf16(x, y); - } - __builtin_trap(); - __builtin_unreachable(); } _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_fmin, ushort, ushort) +// Requires at least sm_80 _CLC_DEF _CLC_OVERLOAD uint __clc_fmin(uint x, uint y) { - if (__clc_nvvm_reflect_arch() >= 800) { return __nvvm_fmin_bf16x2(x, y); - } - __builtin_trap(); - __builtin_unreachable(); } _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_fmin, uint, uint) diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp index 35a89502a2647..0d55d7572faf9 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp @@ -48,7 +48,8 @@ template sycl::marray isnan(sycl::marray x) { template std::enable_if_t, T> fabs(T x) { -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + (__SYCL_CUDA_ARCH__ >= 800) oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x); return oneapi::detail::bitsToBfloat16(__clc_fabs(XBits)); #else @@ -60,13 +61,15 @@ std::enable_if_t, T> fabs(T x) { : x; } return x; -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && + // (__SYCL_CUDA_ARCH__ >= 800) } template sycl::marray fabs(sycl::marray x) { sycl::marray res; -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && 
defined(__NVPTX__) && \ + (__SYCL_CUDA_ARCH__ >= 800) for (size_t i = 0; i < N / 2; i++) { auto partial_res = __clc_fabs(detail::to_uint32_t(x, i * 2)); std::memcpy(&res[i * 2], &partial_res, sizeof(uint32_t)); @@ -81,20 +84,20 @@ sycl::marray fabs(sycl::marray x) { for (size_t i = 0; i < N; i++) { res[i] = fabs(x[i]); } -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && + // (__SYCL_CUDA_ARCH__ >= 800) return res; } template std::enable_if_t, T> fmin(T x, T y) { -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + (__SYCL_CUDA_ARCH__ >= 800) oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x); oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y); return oneapi::detail::bitsToBfloat16(__clc_fmin(XBits, YBits)); #else static const oneapi::detail::Bfloat16StorageT CanonicalNan = 0x7FC0; - oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x); - oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y); if (isnan(x) && isnan(y)) return oneapi::detail::bitsToBfloat16(CanonicalNan); @@ -102,6 +105,8 @@ std::enable_if_t, T> fmin(T x, T y) { return y; if (isnan(y)) return x; + oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x); + oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y); if (((XBits | YBits) == static_cast(0x8000)) && !(XBits & YBits)) @@ -109,14 +114,16 @@ std::enable_if_t, T> fmin(T x, T y) { static_cast(0x8000)); return (x < y) ? x : y; -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && + // (__SYCL_CUDA_ARCH__ >= 800) } template sycl::marray fmin(sycl::marray x, sycl::marray y) { sycl::marray res; -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + (__SYCL_CUDA_ARCH__ >= 800) for (size_t i = 0; i < N / 2; i++) { auto partial_res = __clc_fmin(detail::to_uint32_t(x, i * 2), detail::to_uint32_t(y, i * 2)); @@ -134,20 +141,20 @@ sycl::marray fmin(sycl::marray x, for (size_t i = 0; i < N; i++) { res[i] = fmin(x[i], y[i]); } -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && + // (__SYCL_CUDA_ARCH__ >= 800) return res; } template std::enable_if_t, T> fmax(T x, T y) { -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + (__SYCL_CUDA_ARCH__ >= 800) oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x); oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y); return oneapi::detail::bitsToBfloat16(__clc_fmax(XBits, YBits)); #else static const oneapi::detail::Bfloat16StorageT CanonicalNan = 0x7FC0; - oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x); - oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y); if (isnan(x) && isnan(y)) return oneapi::detail::bitsToBfloat16(CanonicalNan); @@ -155,20 +162,24 @@ std::enable_if_t, T> fmax(T x, T y) { return y; if (isnan(y)) return x; + oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x); + oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y); if (((XBits | YBits) == static_cast(0x8000)) && !(XBits & YBits)) return oneapi::detail::bitsToBfloat16(0); return (x > y) 
? x : y; -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && + // (__SYCL_CUDA_ARCH__ >= 800) } template sycl::marray fmax(sycl::marray x, sycl::marray y) { sycl::marray res; -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + (__SYCL_CUDA_ARCH__ >= 800) for (size_t i = 0; i < N / 2; i++) { auto partial_res = __clc_fmax(detail::to_uint32_t(x, i * 2), detail::to_uint32_t(y, i * 2)); @@ -186,20 +197,23 @@ sycl::marray fmax(sycl::marray x, for (size_t i = 0; i < N; i++) { res[i] = fmax(x[i], y[i]); } -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && + // (__SYCL_CUDA_ARCH__ >= 800) return res; } template std::enable_if_t, T> fma(T x, T y, T z) { -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + (__SYCL_CUDA_ARCH__ >= 800) oneapi::detail::Bfloat16StorageT XBits = oneapi::detail::bfloat16ToBits(x); oneapi::detail::Bfloat16StorageT YBits = oneapi::detail::bfloat16ToBits(y); oneapi::detail::Bfloat16StorageT ZBits = oneapi::detail::bfloat16ToBits(z); return oneapi::detail::bitsToBfloat16(__clc_fma(XBits, YBits, ZBits)); #else return sycl::ext::oneapi::bfloat16{sycl::fma(float{x}, float{y}, float{z})}; -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && + // (__SYCL_CUDA_ARCH__ >= 800) } template @@ -207,7 +221,8 @@ sycl::marray fma(sycl::marray x, sycl::marray y, sycl::marray z) { sycl::marray res; -#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && \ + (__SYCL_CUDA_ARCH__ >= 800) for (size_t i = 0; i < N / 2; i++) { auto partial_res = __clc_fma(detail::to_uint32_t(x, i * 2), detail::to_uint32_t(y, i * 2), @@ -228,7 +243,8 @@ sycl::marray fma(sycl::marray x, for (size_t i = 0; i < N; i++) { res[i] = fma(x[i], y[i], z[i]); } -#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) && + // (__SYCL_CUDA_ARCH__ >= 800) return res; } From 387b78424dada4ec421f523e84628cee23a32e8c Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Thu, 15 Jun 2023 11:45:08 -0700 Subject: [PATCH 42/55] [CI] Use reusable workflow for AWS start/stop job (#9875) To remove code duplication and simplify caller's code. --- .github/workflows/aws.yml | 36 +++++++++++++++ .../workflows/sycl_linux_build_and_test.yml | 45 +++++-------------- 2 files changed, 46 insertions(+), 35 deletions(-) create mode 100644 .github/workflows/aws.yml diff --git a/.github/workflows/aws.yml b/.github/workflows/aws.yml new file mode 100644 index 0000000000000..b149d21dc3548 --- /dev/null +++ b/.github/workflows/aws.yml @@ -0,0 +1,36 @@ +name: Start/Stop AWS instance + +on: + workflow_call: + inputs: + mode: + description: "Mode of operation: start or stop" + type: string + required: true + + runs-on-list: + # See devops/actions/aws-ec2/action.yml for more details. 
+ description: "JSON string with array of objects with aws-type, runs-on, aws-ami, aws-spot, aws-disk, aws-timebomb, one-job properties" + type: string + required: true + +jobs: + aws: + runs-on: ubuntu-20.04 + environment: aws + steps: + - name: Setup script + run: | + mkdir -p ./aws-ec2 + wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/action.yml -P ./aws-ec2 + wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/aws-ec2.js -P ./aws-ec2 + wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/package.json -P ./aws-ec2 + npm install ./aws-ec2 + - name: Start AWS EC2 runners + uses: ./aws-ec2 + with: + mode: ${{ inputs.mode }} + runs-on-list: ${{ inputs.runs-on-list }} + GH_PERSONAL_ACCESS_TOKEN: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} + AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }} + AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }} diff --git a/.github/workflows/sycl_linux_build_and_test.yml b/.github/workflows/sycl_linux_build_and_test.yml index 93eadd1faa533..fd3c597c9018c 100644 --- a/.github/workflows/sycl_linux_build_and_test.yml +++ b/.github/workflows/sycl_linux_build_and_test.yml @@ -183,23 +183,11 @@ jobs: name: Start AWS needs: build if: ${{ inputs.lts_aws_matrix != '[]' }} - runs-on: ubuntu-20.04 - environment: aws - steps: - - name: Setup script - run: | - mkdir -p ./aws-ec2 - wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/action.yml -P ./aws-ec2 - wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/aws-ec2.js -P ./aws-ec2 - wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/package.json -P ./aws-ec2 - npm install ./aws-ec2 - - name: Start AWS EC2 runners - uses: ./aws-ec2 - with: - runs-on-list: ${{ inputs.lts_aws_matrix }} - GH_PERSONAL_ACCESS_TOKEN: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }} - AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }} + uses: ./.github/workflows/aws.yml + secrets: inherit + with: + mode: start + runs-on-list: ${{ inputs.lts_aws_matrix }} e2e-tests: needs: [build, aws-start] @@ -301,21 +289,8 @@ jobs: # Always attempt to shutdown AWS instance, even if AWS start was not # successful. if: ${{ always() && inputs.lts_aws_matrix != '[]' }} - runs-on: ubuntu-20.04 - environment: aws - steps: - - name: Setup script - run: | - mkdir -p ./aws-ec2 - wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/action.yml -P ./aws-ec2 - wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/aws-ec2.js -P ./aws-ec2 - wget raw.githubusercontent.com/intel/llvm/sycl/devops/actions/aws-ec2/package.json -P ./aws-ec2 - npm install ./aws-ec2 - - name: Stop AWS EC2 runners - uses: ./aws-ec2 - with: - runs-on-list: ${{ inputs.lts_aws_matrix }} - mode: stop - GH_PERSONAL_ACCESS_TOKEN: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} - AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }} - AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }} + uses: ./.github/workflows/aws.yml + secrets: inherit + with: + mode: stop + runs-on-list: ${{ inputs.lts_aws_matrix }} From 7d3ddca692f07566a044f5bf6673aa6e5fc283bc Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Thu, 15 Jun 2023 19:47:21 +0100 Subject: [PATCH 43/55] [SYCL] Change unique prefix to start with "uid" (#9898) This commit changes unique ID prefix to start with "uid" to avoid the name having a number as the first character. This is needed for IGC to correctly handle the names when used for externally visible device_global. 
Signed-off-by: Larsen, Steffen --- clang/lib/Driver/Driver.cpp | 2 +- clang/test/Driver/sycl-unique-prefix.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 159035a887cb1..56cab89c26c30 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -6950,7 +6950,7 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args, types::isSrcFile(I.first))) { // Unique ID is generated for source files and preprocessed files. SmallString<128> ResultID; - llvm::sys::fs::createUniquePath("%%%%%%%%%%%%%%%%", ResultID, false); + llvm::sys::fs::createUniquePath("uid%%%%%%%%%%%%%%%%", ResultID, false); addSYCLUniqueID(Args.MakeArgString(ResultID.str()), SrcFileName); } if (!types::isSrcFile(I.first)) diff --git a/clang/test/Driver/sycl-unique-prefix.cpp b/clang/test/Driver/sycl-unique-prefix.cpp index 032c3c0f8a1de..cad78fb81f875 100644 --- a/clang/test/Driver/sycl-unique-prefix.cpp +++ b/clang/test/Driver/sycl-unique-prefix.cpp @@ -3,10 +3,10 @@ // RUN: touch %t_file2.cpp // RUN: %clangxx -fsycl -fsycl-targets=spir64-unknown-unknown,spir64_gen-unknown-unknown -c %t_file1.cpp %t_file2.cpp -### 2>&1 \ // RUN: | FileCheck -check-prefix=CHECK_PREFIX %s -// CHECK_PREFIX: clang{{.*}} "-triple" "spir64-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-unique-prefix=[[PREFIX1:([A-z0-9]){16}]]"{{.*}} "{{.*}}_file1.cpp" +// CHECK_PREFIX: clang{{.*}} "-triple" "spir64-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-unique-prefix=[[PREFIX1:uid([A-z0-9]){16}]]"{{.*}} "{{.*}}_file1.cpp" // CHECK_PREFIX: clang{{.*}} "-triple" "spir64_gen-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-unique-prefix=[[PREFIX1]]"{{.*}} "{{.*}}_file1.cpp" // CHECK_PREFIX: clang{{.*}} "-fsycl-unique-prefix=[[PREFIX1]]"{{.*}} "-fsycl-is-host"{{.*}} "{{.*}}_file1.cpp" -// CHECK_PREFIX: clang{{.*}} "-triple" "spir64-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-unique-prefix=[[PREFIX2:([A-z0-9]){16}]]"{{.*}} "{{.*}}_file2.cpp" +// CHECK_PREFIX: clang{{.*}} "-triple" "spir64-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-unique-prefix=[[PREFIX2:uid([A-z0-9]){16}]]"{{.*}} "{{.*}}_file2.cpp" // CHECK_PREFIX: clang{{.*}} "-triple" "spir64_gen-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-unique-prefix=[[PREFIX2]]"{{.*}} "{{.*}}_file2.cpp" // CHECK_PREFIX: clang{{.*}} "-fsycl-unique-prefix=[[PREFIX2]]"{{.*}} "-fsycl-is-host"{{.*}} "{{.*}}_file2.cpp" @@ -14,5 +14,5 @@ // RUN: touch %t.ii // RUN: %clangxx -fsycl -c %t.ii -### 2>&1 \ // RUN: | FileCheck -check-prefix=CHECK_PREFIX_II %s -// CHECK_PREFIX_II: clang{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-unique-prefix=[[PREFIX:([A-z0-9]){16}]]"{{.*}} "{{.*}}.ii" +// CHECK_PREFIX_II: clang{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-unique-prefix=[[PREFIX:uid([A-z0-9]){16}]]"{{.*}} "{{.*}}.ii" // CHECK_PREFIX_II: clang{{.*}} "-fsycl-unique-prefix=[[PREFIX]]"{{.*}} "-fsycl-is-host"{{.*}} "{{.*}}.ii" From da5e1b98f1c37fdcd51948fb6b70532ad9e49fcc Mon Sep 17 00:00:00 2001 From: Harini Chilamantula Date: Thu, 15 Jun 2023 14:27:51 -0700 Subject: [PATCH 44/55] [Driver][sycl]Adding C++ libraries to the linker with -fsycl (#9896) Icx with -fsycl option behaves as C++ compiler but it does not add C++ libraries at link time. So to have some alignment between compilation and link step we need icx to add C++ libraries if -fsycl option is specified. 
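For illustration, consider a minimal SYCL program (the file name and contents below are illustrative only and are not part of this patch). With this change, driving the build through the C entry point as `clang -fsycl link_check.cpp -o link_check` is expected to add the C++ standard library (e.g. `-lstdc++` on Linux, as the new driver test checks) to the link line, so the C++ runtime symbols pulled in by the SYCL headers and `<iostream>` resolve without manually appending `-lstdc++`:

```c++
// link_check.cpp -- illustrative sketch, not part of the patch.
// Expected to build and link with just:
//   clang -fsycl link_check.cpp -o link_check
#include <sycl/sycl.hpp>
#include <iostream> // requires the C++ standard library at link time

int main() {
  sycl::queue q; // SYCL runtime itself is a C++ library
  std::cout << "Running on "
            << q.get_device().get_info<sycl::info::device::name>() << "\n";
  return 0;
}
```

The implementation below achieves this by switching the driver mode to "g++" when -fsycl is passed in CC mode, reusing the existing C++ link logic instead of duplicating the library list.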
--- clang/lib/Driver/Driver.cpp | 4 ++++ clang/test/Driver/sycl-offload.cpp | 3 +++ 2 files changed, 7 insertions(+) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 56cab89c26c30..eaad116905e55 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1571,6 +1571,10 @@ Compilation *Driver::BuildCompilation(ArrayRef ArgList) { } } + if (Args.hasFlag(options::OPT_fsycl, options::OPT_fno_sycl, false) && + CCCIsCC()) + setDriverMode("g++"); + // Check for working directory option before accessing any files if (Arg *WD = Args.getLastArg(options::OPT_working_directory)) if (VFS->setCurrentWorkingDirectory(WD->getValue())) diff --git a/clang/test/Driver/sycl-offload.cpp b/clang/test/Driver/sycl-offload.cpp index 8337f215fe8f6..0ab01194f06ae 100644 --- a/clang/test/Driver/sycl-offload.cpp +++ b/clang/test/Driver/sycl-offload.cpp @@ -169,3 +169,6 @@ // RUN: | FileCheck -check-prefix IGNORE_INPUT %s // IGNORE_INPUT: input unused +/// Check if the clang with fsycl adds C++ libraries to the link line +// RUN: %clang -### -target x86_64-unknown-linux-gnu -fsycl %s 2>&1 | FileCheck -check-prefix=CHECK-FSYCL-WITH-CLANG %s +// CHECK-FSYCL-WITH-CLANG: "-lstdc++" From c86d8fe1fb4ad6b620824eefd448983cf06deeea Mon Sep 17 00:00:00 2001 From: Vyacheslav Klochkov Date: Thu, 15 Jun 2023 17:17:56 -0500 Subject: [PATCH 45/55] [ESIMD][DOC][NFC] Move ESIMD doc files from "experimental" folder to "supported" (#9892) - Move ESIMD doc files from "experimental" to "supported" folder; - Re-struct the sections in README.md and sycl_ext_intel_esimd.md files; - Add 'examples' folder for ESIMD tests/examples --------- Signed-off-by: Vyacheslav N Klochkov --- .github/CODEOWNERS | 2 +- .../sycl_ext_intel_esimd/ESIMD-TODO-list.md | 19 --- .../sycl_ext_intel_esimd/README.md | 85 +++++------- .../sycl_ext_intel_esimd/examples/README.md | 30 +++++ .../examples/sum_two_arrays.md | 103 +++++++++++++++ .../images/Matrix_2_2_2_4__1_2.svg | 0 .../images/Matrix_4_1_4_2__0_0.svg | 0 .../images/VectorEven.svg | 0 .../sycl_ext_intel_esimd/images/VectorOdd.svg | 0 .../sycl_ext_intel_esimd/images/simd_view.svg | 0 .../sycl_ext_intel_esimd.md | 122 +++++++++++++----- .../sycl_ext_intel_esimd_emulator.md} | 2 +- sycl/include/sycl/ext/intel/esimd.hpp | 2 +- 13 files changed, 255 insertions(+), 110 deletions(-) delete mode 100644 sycl/doc/extensions/experimental/sycl_ext_intel_esimd/ESIMD-TODO-list.md rename sycl/doc/extensions/{experimental => supported}/sycl_ext_intel_esimd/README.md (60%) create mode 100644 sycl/doc/extensions/supported/sycl_ext_intel_esimd/examples/README.md create mode 100644 sycl/doc/extensions/supported/sycl_ext_intel_esimd/examples/sum_two_arrays.md rename sycl/doc/extensions/{experimental => supported}/sycl_ext_intel_esimd/images/Matrix_2_2_2_4__1_2.svg (100%) rename sycl/doc/extensions/{experimental => supported}/sycl_ext_intel_esimd/images/Matrix_4_1_4_2__0_0.svg (100%) rename sycl/doc/extensions/{experimental => supported}/sycl_ext_intel_esimd/images/VectorEven.svg (100%) rename sycl/doc/extensions/{experimental => supported}/sycl_ext_intel_esimd/images/VectorOdd.svg (100%) rename sycl/doc/extensions/{experimental => supported}/sycl_ext_intel_esimd/images/simd_view.svg (100%) rename sycl/doc/extensions/{experimental => supported}/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md (91%) rename sycl/doc/extensions/{experimental/sycl_ext_intel_esimd/esimd_emulator.md => supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd_emulator.md} (99%) diff --git a/.github/CODEOWNERS 
b/.github/CODEOWNERS index 23028a406360f..a50cc69c4e05d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -74,7 +74,7 @@ clang/tools/clang-offload-*/ @intel/dpcpp-tools-reviewers ESIMD/ @intel/dpcpp-esimd-reviewers esimd/ @intel/dpcpp-esimd-reviewers sycl/include/sycl/ext/intel/esimd.hpp @intel/dpcpp-esimd-reviewers -sycl/doc/extensions/experimental/sycl_ext_intel_esimd/ @intel/dpcpp-esimd-reviewers +sycl/doc/extensions/**/sycl_ext_intel_esimd/ @intel/dpcpp-esimd-reviewers llvm/lib/SYCLLowerIR/CMakeLists.txt @intel/dpcpp-tools-reviewers @intel/dpcpp-esimd-reviewers # invoke_simd diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/ESIMD-TODO-list.md b/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/ESIMD-TODO-list.md deleted file mode 100644 index deecaff25d702..0000000000000 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/ESIMD-TODO-list.md +++ /dev/null @@ -1,19 +0,0 @@ -### TODO list for the SYCL Explicit SIMD extension. - -A place for TODO items/feedback regarding the extension in addition to the -github issues mechanism. - -#### Front-End - -1. Fix a generic (unrelated to ESIMD) issue [1811](https://github.com/intel/llvm/issues/1811) with lambda/functor function - detection. Needed to improve diagnostics, fix attribute propagation. - ETA: ??? -2. Fix kernel body function detection. (unrelated to ESIMD) - clang/lib/Sema/SemaSYCL.cpp:296 function isSYCLKernelBodyFunction - The test in the function should involve checking the caller and matching function - types of the caller's parameter and the type of 'this' of this function. But the - information about the original caller (e.g. kernel_parallel_for) is - unavailable at this point - kernel creation infrastructure must be enhanced. - For now the check is only if FD is '()' operator. Works OK for today's - handler::kernel_parallel_for/... implementations as no other '()' operators - are invoked except the kernel body. diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/README.md b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/README.md similarity index 60% rename from sycl/doc/extensions/experimental/sycl_ext_intel_esimd/README.md rename to sycl/doc/extensions/supported/sycl_ext_intel_esimd/README.md index 3a74330c0ac6b..9241a5231f6f0 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/README.md +++ b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/README.md @@ -4,18 +4,27 @@ OneAPI provides the "Explicit SIMD" SYCL extension (or simply "ESIMD") for lower-level Intel GPU programming. It provides APIs closely matching Intel GPU ISA yet allows to write explicitly vectorized device code. This helps programmer to have more control over the generated code and depend less on compiler -optimizations. The [specification](sycl_ext_intel_esimd.md), -[API reference](https://intel.github.io/llvm-docs/doxygen/group__sycl__esimd.html), and -[working code examples](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ESIMD/) are available on the Intel DPC++ project's github. - -**_NOTE:_** _Some parts of this extension is under active development and APIs in the -`sycl::ext::intel::experimental::esimd` package are subject to change. There are -currently a number of [restrictions](#restrictions) specified below._ +optimizations. ESIMD kernels and functions always require the subgroup size of one, which means compiler never does vectorization across work-items in a subgroup. Instead, -vectorization is expressed explicitly in the code by the programmer. 
Here is a -trivial example which adds elements of two arrays and writes the results to the +vectorization is expressed explicitly in the code by the programmer. + +**IMPORTANT NOTE: _Some parts of this extension are under active development. The APIs in the +`sycl::ext::intel::experimental::esimd` namespace are subject to change or removal._** + +Please see the additional resources on the Intel DPC++ project's github: + +1) [ESIMD Extension Specification](./sycl_ext_intel_esimd.md) +1) [ESIMD API/doxygen reference](https://intel.github.io/llvm-docs/doxygen/group__sycl__esimd.html) +1) [ESIMD Emulator](./sycl_ext_intel_esimd_emulator.md) +1) [Examples](./examples/README.md) +1) [ESIMD end-to-end LIT tests](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ESIMD/) +1) [Implementation and API Restrictions](./sycl_ext_intel_esimd.md#implementation-restrictions) + +--- + +Here is a trivial example which adds elements of two arrays and writes the results to the third: ```cpp @@ -23,20 +32,16 @@ third: float *B = malloc_shared(Size, q); float *C = malloc_shared(Size, q); - for (unsigned i = 0; i != Size; i++) { + for (unsigned i = 0; i != Size; i++) A[i] = B[i] = i; - } - - q.submit([&](handler &cgh) { - cgh.parallel_for( - Size / VL, [=](id<1> i)[[intel::sycl_explicit_simd]] { - auto offset = i * VL; - // pointer arithmetic, so offset is in elements: - simd va(A + offset); - simd vb(B + offset); - simd vc = va + vb; - vc.copy_to(C + offset); - }); + + q.parallel_for(Size / VL, [=](id<1> i)[[intel::sycl_explicit_simd]] { + auto offset = i * VL; + // pointer arithmetic, so offset is in elements: + simd va(A + offset); + simd vb(B + offset); + simd vc = va + vb; + vc.copy_to(C + offset); }).wait_and_throw(); ``` @@ -79,7 +84,7 @@ the same application. ### SYCL and ESIMD interoperability SYCL kernels can call ESIMD functions using the special `invoke_simd` API. -More details are available in [invoke_simd spec](../sycl_ext_oneapi_invoke_simd.asciidoc) +More details are available in [invoke_simd spec](../../experimental/sycl_ext_oneapi_invoke_simd.asciidoc) Test cases are available [here](../../../../test-e2e/InvokeSimd/) ```cpp @@ -120,37 +125,5 @@ Currently, compilation of programs with `invoke_simd` calls requires a few addit # and callee in the same module. clang++ -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o invoke_simd # run the program: -IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 invoke_simd +IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 ./invoke_simd ``` - -### ESIMD_EMULATOR backend - -Under Linux environment, the same resulting executable file can be run -on CPU under emulation mode without Intel GPU. For details, check -[ESIMD_EMULATOR back-end] (esimd_emulator.md) - -### Restrictions - -This section contains lists of the main restrictions that apply when using the ESIMD -extension. -> **Note**: Some restrictions are not enforced by the compiler, which may lead to -> undefined program behavior if violated. - -#### Features not supported with the ESIMD extension: -- The [C and C++ Standard libraries support](../supported/C-CXX-StandardLibrary.rst) -- The [Device library extensions](../../../design/DeviceLibExtensions.rst) - -#### Unsupported standard SYCL APIs: -- Local accessors are not implemented yet. Local memory can be allocated and accessed via the explicit device-side API; -- 2D and 3D accessors; -- Constant accessors; -- `sycl::accessor::get_pointer()`. All memory accesses through an accessor are -done via explicit APIs; e.g. 
`sycl::ext::intel::experimental::esimd::block_store(acc, offset)`; -- Accessors with offsets and/or access range specified; -- `sycl::image`, `sycl::sampler`, `sycl::stream` classes; - -#### Other restrictions: - -- Only Intel GPU device is supported. -- Interoperability between regular SYCL and ESIMD kernels is only supported one way. - Regular SYCL kernels can call ESIMD functions, but not vice-versa. Invocation of SYCL code from ESIMD is not supported yet. diff --git a/sycl/doc/extensions/supported/sycl_ext_intel_esimd/examples/README.md b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/examples/README.md new file mode 100644 index 0000000000000..917752f03df50 --- /dev/null +++ b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/examples/README.md @@ -0,0 +1,30 @@ +# ESIMD Examples + +This folder contains simple ESIMD examples. The main purpose of having them +is to show the basic ESIMD APIs in well known examples. + +1) The most basic example - ["sum_two_arrays"](./sum_two_arrays.md). + + Please see the full source here: ["sum_two_arrays"](./sum_two_arrays.md). + ```c++ + float *a = malloc_shared(Size, q); // USM memory for A + float *b = new float[Size]; // B uses HOST memory + buffer buf_b(b, Size); + + // Initialize 'a' and 'b' here. + + // Compute: a[i] += b[i]; + q.submit([&](handler &cgh) { + auto acc_b = buf_b.get_access(cgh); + cgh.parallel_for(Size / VL, [=](id<1> i) [[intel::sycl_explicit_simd]] { + auto element_offset = i * VL; + simd vec_a(a + element_offset); // Pointer arithmetic uses element offset + simd vec_b(acc_b, element_offset * sizeof(float)); // accessor API uses byte-offset + + vec_a += vec_b; + vec_a.copy_to(a + element_offset); + }); + }).wait_and_throw(); + ``` + +2) TODO: Add more examples here. diff --git a/sycl/doc/extensions/supported/sycl_ext_intel_esimd/examples/sum_two_arrays.md b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/examples/sum_two_arrays.md new file mode 100644 index 0000000000000..771300099354d --- /dev/null +++ b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/examples/sum_two_arrays.md @@ -0,0 +1,103 @@ +## Sum elements in two arrays: a[i] += b[i]. + +Compile and run: +```bash +> clang++ -fsycl sum_two_arrays.cpp + +> ONEAPI_DEVICE_SELECTOR=level_zero:gpu ./a.out +Running on Intel(R) UHD Graphics 630 +Passed +``` + +Source code: +```C++ +#include +#include + +#include + +#if !defined(USE_SYCL) && !defined(USE_ESIMD) +#define USE_ESIMD +#endif + +using namespace sycl; +using namespace sycl::ext::intel::esimd; + +inline auto createExceptionHandler() { + return [](exception_list l) { + for (auto ep : l) { + try { + std::rethrow_exception(ep); + } catch (sycl::exception &e0) { + std::cout << "sycl::exception: " << e0.what() << std::endl; + } catch (std::exception &e) { + std::cout << "std::exception: " << e.what() << std::endl; + } catch (...) 
{ + std::cout << "generic exception\n"; + } + } + }; +} + +struct usm_deleter { + queue q; + void operator()(void *ptr) { + if (ptr) + sycl::free(ptr, q); + } +}; + +int main() { + constexpr unsigned Size = 128; + constexpr unsigned VL = 32; + int err_cnt = 0; + + try { + queue q(gpu_selector_v, createExceptionHandler()); + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + float *a = malloc_shared(Size, q); // USM memory for A + float *b = new float[Size]; // B uses HOST memory + buffer buf_b(b, Size); + + std::unique_ptr guard_a(a, usm_deleter{ q }); + std::unique_ptr guard_b(b); + + for (unsigned i = 0; i != Size; i++) + a[i] = b[i] = i; + + q.submit([&](handler &cgh) { + auto acc_b = buf_b.get_access(cgh); +#ifdef USE_ESIMD + cgh.parallel_for(Size / VL, [=](id<1> i) [[intel::sycl_explicit_simd]] { + auto element_offset = i * VL; + simd vec_a(a + element_offset); // Pointer arithmetic uses element offset + simd vec_b(acc_b, element_offset * sizeof(float)); // accessor API uses byte-offset + + vec_a += vec_b; + vec_a.copy_to(a + element_offset); + }); +#elif defined(USE_SYCL) + cgh.parallel_for(Size, [=](id<1> i) { + a[i] += acc_b[i]; + }); +#endif + }).wait_and_throw(); + + for (unsigned i = 0; i < Size; ++i) { + if (a[i] != (float)i + (float)i) { + err_cnt++; + std::cout << "failed at" << i << ": " << a[i] << " != " << (float)i + << " + " << (float)i << std::endl; + } + } + } + catch (sycl::exception &e) { + std::cout << "SYCL exception caught: " << e.what() << "\n"; + return 1; + } + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +} +``` diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/Matrix_2_2_2_4__1_2.svg b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/Matrix_2_2_2_4__1_2.svg similarity index 100% rename from sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/Matrix_2_2_2_4__1_2.svg rename to sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/Matrix_2_2_2_4__1_2.svg diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/Matrix_4_1_4_2__0_0.svg b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/Matrix_4_1_4_2__0_0.svg similarity index 100% rename from sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/Matrix_4_1_4_2__0_0.svg rename to sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/Matrix_4_1_4_2__0_0.svg diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/VectorEven.svg b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/VectorEven.svg similarity index 100% rename from sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/VectorEven.svg rename to sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/VectorEven.svg diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/VectorOdd.svg b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/VectorOdd.svg similarity index 100% rename from sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/VectorOdd.svg rename to sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/VectorOdd.svg diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/simd_view.svg b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/simd_view.svg similarity index 100% rename from sycl/doc/extensions/experimental/sycl_ext_intel_esimd/images/simd_view.svg rename to sycl/doc/extensions/supported/sycl_ext_intel_esimd/images/simd_view.svg diff --git 
a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md similarity index 91% rename from sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md rename to sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md index 29095552aeae2..b433adb160b21 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md +++ b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md @@ -1,5 +1,48 @@ # Explicit SIMD Programming Extension for DPC++ +## Table of content +- [Introduction](#introduction) +- [Explicit SIMD execution model](#explicit-simd-execution-model) +- [Explicit SIMD extension APIs](#explicit-simd-extension-apis) +- [Core Explicit SIMD programming APIs](#core-explicit-simd-programming-apis) + - [SIMD vector class](#simd-vector-class) + - [simd_view class](#simd_view-class) + - [Reduction functions](#reduction-functions) + - [Memory access APIs](#memory-access-apis) + - [APIs overview](#apis-overview) + - [SLM - Shared local memory access](#shared-local-memory-access) + - [Static allocation of SLM using slm_init function](#static-allocation-of-slm-using-slm_init-function) + - [Semi-dynamic allocation of SLM](#semi-dynamic-allocation-of-slm) + - [Local accessors](#local-accessors) + - [Atomics](#atomics) + - [Math operations](#math-operations) + - [Extended math](#extended-math) + - [Other standard math](#other-standard-math) + - [Other non-standard math functions](#other-non-standard-math-functions) + - [Dot Product Accumulate Systolic - `DPAS` API](#dot-product-accumulate-systolic---dpas-api) + - [DPAS API definition](#dpas-api-definition) + - [Example of DPAS usage](#example-of-dpas-usage) + - [Possible type combinations for `xmx::dpas()`](#possible-type-combinations-for-xmxdpas) + - [Input and output matrices representation as simd vectors](#input-and-output-matrices-representation-as-simd-vectors) + - [Horizontal packing](#horizontal-packing-for-a-c-and-result) + - [Vertical packing](#vertical-packing) + - [Other APIs](#other-apis) + - [Private Global Variables](#private-global-variables) + - [__regcall Calling convention](#__regcall-calling-convention) + - [Inline assembly](#inline-assembly) +- [Implementation restrictions](#implementation-restrictions) + - [Features not supported with the ESIMD extension](#features-not-supported-with-the-esimd-extension) + - [Unsupported standard SYCL APIs](#unsupported-standard-sycl-apis) + - [Other restrictions](#other-restrictions) + +## Other content: +* [ESIMD API/doxygen reference](https://intel.github.io/llvm-docs/doxygen/group__sycl__esimd.html) +* [ESIMD Emulator](./sycl_ext_intel_esimd_emulator.md) +* [Examples](./examples/README.md) +* [ESIMD LIT tests - working code examples](https://github.com/intel/llvm/blob/sycl/sycl/test-e2e/ESIMD/) + +--- + ## Introduction The main motivation for introducing the "Explicit SIMD" SYCL extension @@ -12,8 +55,15 @@ Explicit SIMD provides the following key features complementary to SYCL: general register file. This allows to write efficient code not relying on further widening by the compiler, as with traditional SPMD programming. - Low-level APIs efficiently mapped to the Intel GPU architecture, such as block loads/stores/gathers/scatters, explicit cache hints, GPU inline assembly, etc. +- Regular SYCL and ESIMD kernels can co-exist in the same translation unit and in +the same application. 
For more details, see [SYCL and ESIMD interoperability](./README.md#sycl-and-esimd-interoperability) + +Explicit SIMD has some [restrictions](#implementation-restrictions) as well. -Explicit SIMD though have some [restrictions](./README.md#restrictions) as well. +**IMPORTANT NOTE: _Some parts of this extension are under active development. The APIs in the +`sycl::ext::intel::experimental::esimd` namespace are subject to change or removal._** + +--- ## Explicit SIMD execution model @@ -35,7 +85,7 @@ other devices will result in error. All the ESIMD APIs are defined in the `sycl::ext::intel::esimd` namespace. -Kernels and `SYCL_EXTERNAL` functions using ESP must be explicitly marked with +Kernels and `SYCL_EXTERNAL` functions using ESIMD must be explicitly marked with the `[[intel::sycl_explicit_simd]]` attribute. Subgroup size query within such functions will always return `1`. @@ -94,25 +144,6 @@ int main(void) { } ``` -## Implementation restrictions - -Current ESIMD implementation does not support using certain standard SYCL features -inside explicit SIMD kernels and functions. Most of them will be eventually -dropped. What's not supported today: -- Explicit SIMD kernels can co-exist with regular SYCL kernels in the same - translation unit and in the same program. -- Interoperability between regular SYCL and ESIMD kernels is only supported one way. - Regular SYCL kernels can call ESIMD functions, but not vice-versa. Invocation of SYCL code from ESIMD is not supported yet. -- Local accessors are not supported yet. Local memory is allocated and accessed - via explicit device-side API. -- 2D and 3D accessors; -- Constant accessors; -- `sycl::accessor::get_pointer()`. All memory accesses through an accessor are -done via explicit APIs; e.g. `sycl::ext::intel::esimd::block_store(acc, offset)` -- Accessors with offsets and/or access range specified -- `sycl::image`, `sycl::sampler` and `sycl::stream` classes. - - ## Core Explicit SIMD programming APIs The DPC++ Explicit SIMD library defines the following classes to enhance the @@ -392,7 +423,7 @@ This memory is shared between work items in a workgroup - basically it is ESIMD variant of the SYCL `local` memory. SLM variants of APIs have 'slm_' prefix in their names, -e.g. slm_block_load(), or lsc_slm_gather(). +e.g. ext::intel::esimd::slm_block_load() or ext::intel::experimental::esimd::lsc_slm_gather(). SLM memory must be explicitly allocated before it is read or written. @@ -1036,16 +1067,13 @@ int main(void) { for (unsigned i = 0; i != Size; i++) { A[i] = B[i] = i; } - q.submit([&](handler &cgh) { - cgh.parallel_for( - Size / VL, [=](id<1> i)[[intel::sycl_explicit_simd]]{ - auto offset = i * VL; - // pointer arithmetic, so offset is in elements: - simd va(A + offset); - simd vb(B + offset); - simd vc = va + vb; - vc.copy_to(C + offset); - }); + q.parallel_for(Size / VL, [=](id<1> i) [[intel::sycl_explicit_simd]] { + auto offset = i * VL; + // pointer arithmetic, so offset is in elements: + simd va(A + offset); + simd vb(B + offset); + simd vc = va + vb; + vc.copy_to(C + offset); }).wait_and_throw(); for (unsigned i = 0; i < Size; ++i) { @@ -1072,3 +1100,33 @@ int main(void) { ``` more examples can be found in the [ESIMD test suite](https://github.com/intel/llvm/tree/sycl/sycl/test-e2e/ESIMD) on github. + +## Implementation restrictions + +This section contains a list of the main restrictions that apply when using the ESIMD +extension. 
+> **Note**: Some restrictions are not enforced by the compiler, which may lead to +> undefined program behavior if violated. + +### Features not supported with the ESIMD extension: +- The [C and C++ Standard libraries support](../C-CXX-StandardLibrary.rst) +- The [Device library extensions](../../../design/DeviceLibExtensions.rst) + +### Unsupported standard SYCL APIs: + +The current ESIMD implementation does not support certain standard SYCL features +inside ESIMD kernels and functions. Most of missing SYCL features listed below +must be supported eventually: +- 2D and 3D target::device accessor and local_accessor; +- Constant accessors; +- `sycl::accessor::get_pointer()` and `sycl::accessor::operator[]` are supported only with `-fsycl-esimd-force-stateless-mem`. Otherwise, All memory accesses through an accessor are +done via explicit APIs; e.g. `sycl::ext::intel::esimd::block_store(acc, offset)` +- Accessors with non-zero offsets to accessed buffer; +- Accessors with access/memory range specified; +- `sycl::image`, `sycl::sampler` and `sycl::stream` classes. + +### Other restrictions: + +- Only Intel GPU devices are supported. +- Interoperability between regular SYCL and ESIMD kernels is only supported one way. + Regular SYCL kernels can call ESIMD functions, but not vice-versa. Invocation of SYCL code from ESIMD is not supported yet. diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/esimd_emulator.md b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd_emulator.md similarity index 99% rename from sycl/doc/extensions/experimental/sycl_ext_intel_esimd/esimd_emulator.md rename to sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd_emulator.md index 6e8010f093c7c..97990ae382d79 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/esimd_emulator.md +++ b/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd_emulator.md @@ -79,7 +79,7 @@ To compile using the open-source Intel DPC++ compiler: > `$ clang++ -fsycl vadd_usm.cpp` To compile using Intel(R) OneAPI Toolkit: -> `$ dpcpp vadd_usm.cpp` +> `$ icpx -fsycl vadd_usm.cpp` To run under emulation through ESIMD_EMULATOR backend: > `$ ONEAPI_DEVICE_SELECTOR=ext_intel_esimd_emulator:gpu ./a.out` diff --git a/sycl/include/sycl/ext/intel/esimd.hpp b/sycl/include/sycl/ext/intel/esimd.hpp index 2e20864bffa7b..99e44c5d3ad5c 100644 --- a/sycl/include/sycl/ext/intel/esimd.hpp +++ b/sycl/include/sycl/ext/intel/esimd.hpp @@ -15,7 +15,7 @@ /// @defgroup sycl_esimd DPC++ Explicit SIMD API /// This is a low-level API providing direct access to Intel GPU hardware /// features. ESIMD overview can be found -/// [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md). +/// [here](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md). /// Some terminology used in the API documentation: /// - *lane* - /// (or "vector lane") Individual "lane" of input and output elements From a4164ef51365e7122168d1a69cdffc6b5da2d187 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Fri, 16 Jun 2023 06:09:39 +0100 Subject: [PATCH 46/55] [SYCL][E2E] Split memcpy2d and copy2d tests by allocation types (#9870) The copy2d and memcpy2d, despite having been reduced in coverage, are still some of the most time-consuming tests in the e2e test suite. 
To amend this, this commit splits them up into multiple tests, based on what type of allocation the copy operation is from and to. This allows the tests to have LIT-level aspect requirements and lets LIT parallelize running them. To avoid being overly aggressive with the number of files in the e2e USM folder, this commit also moves the 2D USM memory operations into a new memops2d subfolder. --------- Signed-off-by: Larsen, Steffen --- .../copy2d_common.hpp} | 48 ++----------------- .../USM/memops2d/copy2d_device_to_device.cpp | 18 +++++++ .../USM/memops2d/copy2d_device_to_dhost.cpp | 18 +++++++ .../USM/memops2d/copy2d_device_to_host.cpp | 18 +++++++ .../USM/memops2d/copy2d_device_to_shared.cpp | 18 +++++++ .../USM/memops2d/copy2d_dhost_to_device.cpp | 18 +++++++ .../USM/memops2d/copy2d_dhost_to_dhost.cpp | 17 +++++++ .../USM/memops2d/copy2d_dhost_to_host.cpp | 18 +++++++ .../USM/memops2d/copy2d_dhost_to_shared.cpp | 18 +++++++ .../USM/memops2d/copy2d_host_to_device.cpp | 18 +++++++ .../USM/memops2d/copy2d_host_to_dhost.cpp | 18 +++++++ .../USM/memops2d/copy2d_host_to_host.cpp | 18 +++++++ .../USM/memops2d/copy2d_host_to_shared.cpp | 18 +++++++ .../USM/memops2d/copy2d_shared_to_device.cpp | 18 +++++++ .../USM/memops2d/copy2d_shared_to_dhost.cpp | 18 +++++++ .../USM/memops2d/copy2d_shared_to_host.cpp | 18 +++++++ .../USM/memops2d/copy2d_shared_to_shared.cpp | 18 +++++++ sycl/test-e2e/USM/{ => memops2d}/fill2d.cpp | 0 .../memcpy2d_common.hpp} | 48 ++----------------- .../memops2d/memcpy2d_device_to_device.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_device_to_dhost.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_device_to_host.cpp | 18 +++++++ .../memops2d/memcpy2d_device_to_shared.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_dhost_to_device.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_dhost_to_dhost.cpp | 17 +++++++ .../USM/memops2d/memcpy2d_dhost_to_host.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_dhost_to_shared.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_host_to_device.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_host_to_dhost.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_host_to_host.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_host_to_shared.cpp | 18 +++++++ .../memops2d/memcpy2d_shared_to_device.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_shared_to_dhost.cpp | 18 +++++++ .../USM/memops2d/memcpy2d_shared_to_host.cpp | 18 +++++++ .../memops2d/memcpy2d_shared_to_shared.cpp | 18 +++++++ .../USM/{ => memops2d}/memops2d_utils.hpp | 6 +-- sycl/test-e2e/USM/{ => memops2d}/memset2d.cpp | 0 37 files changed, 581 insertions(+), 95 deletions(-) rename sycl/test-e2e/USM/{copy2d.cpp => memops2d/copy2d_common.hpp} (90%) create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_device_to_device.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_device_to_dhost.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_device_to_host.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_device_to_shared.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_dhost_to_device.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_dhost_to_dhost.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_dhost_to_host.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_dhost_to_shared.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_host_to_device.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_host_to_dhost.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_host_to_host.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_host_to_shared.cpp create mode 100644 
sycl/test-e2e/USM/memops2d/copy2d_shared_to_device.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_shared_to_dhost.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_shared_to_host.cpp create mode 100644 sycl/test-e2e/USM/memops2d/copy2d_shared_to_shared.cpp rename sycl/test-e2e/USM/{ => memops2d}/fill2d.cpp (100%) rename sycl/test-e2e/USM/{memcpy2d.cpp => memops2d/memcpy2d_common.hpp} (90%) create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_device_to_device.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_device_to_dhost.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_device_to_host.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_device_to_shared.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_device.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_dhost.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_host.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_shared.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_host_to_device.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_host_to_dhost.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_host_to_host.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_host_to_shared.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_device.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_dhost.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_host.cpp create mode 100644 sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_shared.cpp rename sycl/test-e2e/USM/{ => memops2d}/memops2d_utils.hpp (98%) rename sycl/test-e2e/USM/{ => memops2d}/memset2d.cpp (100%) diff --git a/sycl/test-e2e/USM/copy2d.cpp b/sycl/test-e2e/USM/memops2d/copy2d_common.hpp similarity index 90% rename from sycl/test-e2e/USM/copy2d.cpp rename to sycl/test-e2e/USM/memops2d/copy2d_common.hpp index 02b945d005993..aad4e85f11c0f 100644 --- a/sycl/test-e2e/USM/copy2d.cpp +++ b/sycl/test-e2e/USM/memops2d/copy2d_common.hpp @@ -1,16 +1,4 @@ -//==---- copy2d.cpp - USM 2D copy test -------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// Temporarily disabled until the failure is addressed. 
-// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) +#pragma once #include @@ -397,8 +385,8 @@ int testForAllPaths(queue &Q, T ExpectedVal1, T ExpectedVal2) { return Failures; } -template -int testForAllTypesAndPaths(queue &Q) { +template int test() { + queue Q; bool SupportsHalf = Q.get_device().has(aspect::fp16); bool SupportsDouble = Q.get_device().has(aspect::fp64); @@ -424,33 +412,3 @@ int testForAllTypesAndPaths(queue &Q) { Q, TestStructRef1, TestStructRef2); return Failures; } - -template int testForAllTypesAndPathsAndDsts(queue &Q) { - int Failures = 0; - Failures += testForAllTypesAndPaths(Q); - if (Q.get_device().has(aspect::usm_device_allocations)) - Failures += testForAllTypesAndPaths(Q); - if (Q.get_device().has(aspect::usm_host_allocations)) - Failures += testForAllTypesAndPaths(Q); - if (Q.get_device().has(aspect::usm_shared_allocations)) - Failures += testForAllTypesAndPaths(Q); - return Failures; -} - -int main() { - queue Q; - - int Failures = 0; - Failures += testForAllTypesAndPathsAndDsts(Q); - if (Q.get_device().has(aspect::usm_device_allocations)) - Failures += testForAllTypesAndPathsAndDsts(Q); - if (Q.get_device().has(aspect::usm_host_allocations)) - Failures += testForAllTypesAndPathsAndDsts(Q); - if (Q.get_device().has(aspect::usm_shared_allocations)) - Failures += testForAllTypesAndPathsAndDsts(Q); - - if (!Failures) - std::cout << "Passed!" << std::endl; - - return Failures; -} diff --git a/sycl/test-e2e/USM/memops2d/copy2d_device_to_device.cpp b/sycl/test-e2e/USM/memops2d/copy2d_device_to_device.cpp new file mode 100644 index 0000000000000..7105217981123 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_device_to_device.cpp @@ -0,0 +1,18 @@ +//==-- copy2d_device_to_device.cpp - 2D copy from device USM to device USM -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_device_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/copy2d_device_to_dhost.cpp new file mode 100644 index 0000000000000..524714903d9c1 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_device_to_dhost.cpp @@ -0,0 +1,18 @@ +//==----- copy2d_device_to_dhost.cpp - 2D copy from device USM to host -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. 
+// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_device_to_host.cpp b/sycl/test-e2e/USM/memops2d/copy2d_device_to_host.cpp new file mode 100644 index 0000000000000..376afa79af3b2 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_device_to_host.cpp @@ -0,0 +1,18 @@ +//==---- copy2d_device_to_host.cpp - 2D copy from device USM to host USM ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations, aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_device_to_shared.cpp b/sycl/test-e2e/USM/memops2d/copy2d_device_to_shared.cpp new file mode 100644 index 0000000000000..0897b277c1ae2 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_device_to_shared.cpp @@ -0,0 +1,18 @@ +//==-- copy2d_device_to_shared.cpp - 2D copy from device USM to shared USM -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations, aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_device.cpp b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_device.cpp new file mode 100644 index 0000000000000..bace791339282 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_device.cpp @@ -0,0 +1,18 @@ +//==----- copy2d_dhost_to_device.cpp - 2D copy from host to device USM -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_dhost.cpp new file mode 100644 index 0000000000000..f6aed2ea89f13 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_dhost.cpp @@ -0,0 +1,17 @@ +//==--------- copy2d_dhost_to_dhost.cpp - 2D copy from host to host --------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_host.cpp b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_host.cpp new file mode 100644 index 0000000000000..989c76d6f11d9 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_host.cpp @@ -0,0 +1,18 @@ +//==------- copy2d_dhost_to_host.cpp - 2D copy from host to host USM -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_shared.cpp b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_shared.cpp new file mode 100644 index 0000000000000..7b2a547ed9b28 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_dhost_to_shared.cpp @@ -0,0 +1,18 @@ +//==----- copy2d_dhost_to_shared.cpp - 2D copy from host to shared USM -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_host_to_device.cpp b/sycl/test-e2e/USM/memops2d/copy2d_host_to_device.cpp new file mode 100644 index 0000000000000..08e21a7bc616e --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_host_to_device.cpp @@ -0,0 +1,18 @@ +//==---- copy2d_host_to_device.cpp - 2D copy from host USM to device USM ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations, aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. 
+// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_host_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/copy2d_host_to_dhost.cpp new file mode 100644 index 0000000000000..768124e58c77d --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_host_to_dhost.cpp @@ -0,0 +1,18 @@ +//==------- copy2d_host_to_dhost.cpp - 2D copy from host USM to host -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_host_to_host.cpp b/sycl/test-e2e/USM/memops2d/copy2d_host_to_host.cpp new file mode 100644 index 0000000000000..b286386fc348a --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_host_to_host.cpp @@ -0,0 +1,18 @@ +//==----- copy2d_host_to_host.cpp - 2D copy from host USM to device USM ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_host_to_shared.cpp b/sycl/test-e2e/USM/memops2d/copy2d_host_to_shared.cpp new file mode 100644 index 0000000000000..46be8cb2b73ce --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_host_to_shared.cpp @@ -0,0 +1,18 @@ +//==---- copy2d_host_to_shared.cpp - 2D copy from host USM to shared USM ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations, aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_shared_to_device.cpp b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_device.cpp new file mode 100644 index 0000000000000..596be876d0283 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_device.cpp @@ -0,0 +1,18 @@ +//==-- copy2d_shared_to_device.cpp - 2D copy from shared USM to device USM -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations, aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_shared_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_dhost.cpp new file mode 100644 index 0000000000000..1bb0270688c9b --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_dhost.cpp @@ -0,0 +1,18 @@ +//==----- copy2d_shared_to_dhost.cpp - 2D copy from shared USM to host -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_shared_to_host.cpp b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_host.cpp new file mode 100644 index 0000000000000..126e47d7417c1 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_host.cpp @@ -0,0 +1,18 @@ +//==---- copy2d_shared_to_host.cpp - 2D copy from shared USM to host USM ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations, aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/copy2d_shared_to_shared.cpp b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_shared.cpp new file mode 100644 index 0000000000000..bb83ca22ce9fb --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/copy2d_shared_to_shared.cpp @@ -0,0 +1,18 @@ +//==-- copy2d_shared_to_device.cpp - 2D copy from shared USM to shared USM -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. 
+// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "copy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/fill2d.cpp b/sycl/test-e2e/USM/memops2d/fill2d.cpp similarity index 100% rename from sycl/test-e2e/USM/fill2d.cpp rename to sycl/test-e2e/USM/memops2d/fill2d.cpp diff --git a/sycl/test-e2e/USM/memcpy2d.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_common.hpp similarity index 90% rename from sycl/test-e2e/USM/memcpy2d.cpp rename to sycl/test-e2e/USM/memops2d/memcpy2d_common.hpp index d5f733dba5033..79a25f4da64eb 100644 --- a/sycl/test-e2e/USM/memcpy2d.cpp +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_common.hpp @@ -1,16 +1,4 @@ -//==---- memcpy2d.cpp - USM 2D memcpy test ---------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// Temporarily disabled until the failure is addressed. -// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) +#pragma once #include @@ -403,8 +391,8 @@ int testForAllPaths(queue &Q, T ExpectedVal1, T ExpectedVal2) { return Failures; } -template -int testForAllTypesAndPaths(queue &Q) { +template int test() { + queue Q; bool SupportsHalf = Q.get_device().has(aspect::fp16); bool SupportsDouble = Q.get_device().has(aspect::fp64); @@ -430,33 +418,3 @@ int testForAllTypesAndPaths(queue &Q) { Q, TestStructRef1, TestStructRef2); return Failures; } - -template int testForAllTypesAndPathsAndDsts(queue &Q) { - int Failures = 0; - Failures += testForAllTypesAndPaths(Q); - if (Q.get_device().has(aspect::usm_device_allocations)) - Failures += testForAllTypesAndPaths(Q); - if (Q.get_device().has(aspect::usm_host_allocations)) - Failures += testForAllTypesAndPaths(Q); - if (Q.get_device().has(aspect::usm_shared_allocations)) - Failures += testForAllTypesAndPaths(Q); - return Failures; -} - -int main() { - queue Q; - - int Failures = 0; - Failures += testForAllTypesAndPathsAndDsts(Q); - if (Q.get_device().has(aspect::usm_device_allocations)) - Failures += testForAllTypesAndPathsAndDsts(Q); - if (Q.get_device().has(aspect::usm_host_allocations)) - Failures += testForAllTypesAndPathsAndDsts(Q); - if (Q.get_device().has(aspect::usm_shared_allocations)) - Failures += testForAllTypesAndPathsAndDsts(Q); - - if (!Failures) - std::cout << "Passed!" << std::endl; - - return Failures; -} diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_device.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_device.cpp new file mode 100644 index 0000000000000..519f44d25aeb9 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_device.cpp @@ -0,0 +1,18 @@ +//== memcpy2d_device_to_device.cpp - 2D memcpy from device USM to device USM =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. 
+// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_dhost.cpp new file mode 100644 index 0000000000000..8467f7720b445 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_dhost.cpp @@ -0,0 +1,18 @@ +//==--- memcpy2d_device_to_dhost.cpp - 2D memcpy from device USM to host ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_host.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_host.cpp new file mode 100644 index 0000000000000..558aab293aaa1 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_host.cpp @@ -0,0 +1,18 @@ +//==-- memcpy2d_device_to_host.cpp - 2D memcpy from device USM to host USM -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations, aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_shared.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_shared.cpp new file mode 100644 index 0000000000000..4fe95014e46cc --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_device_to_shared.cpp @@ -0,0 +1,18 @@ +//== memcpy2d_device_to_shared.cpp - 2D memcpy from device USM to shared USM =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations, aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_device.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_device.cpp new file mode 100644 index 0000000000000..cacf0b673f907 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_device.cpp @@ -0,0 +1,18 @@ +//==--- memcpy2d_dhost_to_device.cpp - 2D memcpy from host to device USM ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_dhost.cpp new file mode 100644 index 0000000000000..66b5bc2fffe49 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_dhost.cpp @@ -0,0 +1,17 @@ +//==------- memcpy2d_dhost_to_dhost.cpp - 2D memcpy from host to host ------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_host.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_host.cpp new file mode 100644 index 0000000000000..b4553df6e575d --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_host.cpp @@ -0,0 +1,18 @@ +//==----- memcpy2d_dhost_to_host.cpp - 2D memcpy from host to host USM -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_shared.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_shared.cpp new file mode 100644 index 0000000000000..b4b232d326131 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_dhost_to_shared.cpp @@ -0,0 +1,18 @@ +//==--- memcpy2d_dhost_to_shared.cpp - 2D memcpy from host to shared USM ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. 
+// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_device.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_device.cpp new file mode 100644 index 0000000000000..a20dcb99d62dc --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_device.cpp @@ -0,0 +1,18 @@ +//==-- memcpy2d_host_to_device.cpp - 2D memcpy from host USM to device USM -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations, aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_dhost.cpp new file mode 100644 index 0000000000000..4dce170c37743 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_dhost.cpp @@ -0,0 +1,18 @@ +//==----- memcpy2d_host_to_dhost.cpp - 2D memcpy from host USM to host -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_host.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_host.cpp new file mode 100644 index 0000000000000..ec8bd062c3501 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_host.cpp @@ -0,0 +1,18 @@ +//==--- memcpy2d_host_to_host.cpp - 2D memcpy from host USM to device USM --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_shared.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_shared.cpp new file mode 100644 index 0000000000000..54ce7718bc564 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_host_to_shared.cpp @@ -0,0 +1,18 @@ +//==-- memcpy2d_host_to_shared.cpp - 2D memcpy from host USM to shared USM -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_host_allocations, aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_device.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_device.cpp new file mode 100644 index 0000000000000..f04f9d47cead2 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_device.cpp @@ -0,0 +1,18 @@ +//== memcpy2d_shared_to_device.cpp - 2D memcpy from shared USM to device USM =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations, aspect-usm_device_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_dhost.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_dhost.cpp new file mode 100644 index 0000000000000..1dcfe5e4e04dd --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_dhost.cpp @@ -0,0 +1,18 @@ +//==--- memcpy2d_shared_to_dhost.cpp - 2D memcpy from shared USM to host ---==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_host.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_host.cpp new file mode 100644 index 0000000000000..fbf8f8d191956 --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_host.cpp @@ -0,0 +1,18 @@ +//==-- memcpy2d_shared_to_host.cpp - 2D memcpy from shared USM to host USM -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations, aspect-usm_host_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. 
+// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_shared.cpp b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_shared.cpp new file mode 100644 index 0000000000000..50572328c075a --- /dev/null +++ b/sycl/test-e2e/USM/memops2d/memcpy2d_shared_to_shared.cpp @@ -0,0 +1,18 @@ +//== memcpy2d_shared_to_device.cpp - 2D memcpy from shared USM to shared USM =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// REQUIRES: aspect-usm_shared_allocations +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Temporarily disabled until the failure is addressed. +// UNSUPPORTED: gpu-intel-pvc || (level_zero && windows) + +#include "memcpy2d_common.hpp" + +int main() { return test(); } diff --git a/sycl/test-e2e/USM/memops2d_utils.hpp b/sycl/test-e2e/USM/memops2d/memops2d_utils.hpp similarity index 98% rename from sycl/test-e2e/USM/memops2d_utils.hpp rename to sycl/test-e2e/USM/memops2d/memops2d_utils.hpp index 3e4bd3ddbe61c..d294c9df67c24 100644 --- a/sycl/test-e2e/USM/memops2d_utils.hpp +++ b/sycl/test-e2e/USM/memops2d/memops2d_utils.hpp @@ -12,11 +12,7 @@ using namespace sycl; -enum OperationPath { - Expanded, - ExpandedDependsOn, - ShortcutEventList -}; +enum OperationPath { Expanded, ExpandedDependsOn, ShortcutEventList }; enum Alloc { Device = (int)usm::alloc::device, diff --git a/sycl/test-e2e/USM/memset2d.cpp b/sycl/test-e2e/USM/memops2d/memset2d.cpp similarity index 100% rename from sycl/test-e2e/USM/memset2d.cpp rename to sycl/test-e2e/USM/memops2d/memset2d.cpp From 0a39bb880bd1ce5505b059dcde0406b58c7881eb Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Fri, 16 Jun 2023 05:39:09 -0400 Subject: [PATCH 47/55] [SYCL][ESIMD][E2E] Revert "Fix compile fail for lsc_gather_scatter_stateless_64.cpp test (#9816)" (#9911) After https://github.com/intel/llvm/commit/8b85b6400b8271cf3bcf8e91fe1cb81a2665e23a, we don't need the conversion operator so let's revert the change. I manually confirmed this compiles. This reverts commit f448631ace2e6ea31e1f506d9444185a099981c0. 
--- .../lsc/lsc_gather_scatter_stateless_64.cpp | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_gather_scatter_stateless_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_gather_scatter_stateless_64.cpp index 02610c944b86c..6ff0dc9af80fa 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_gather_scatter_stateless_64.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_gather_scatter_stateless_64.cpp @@ -40,26 +40,18 @@ int main() { uint64_t offsetStart = (Size - VL) * sizeof(uint64_t); simd offset(offsetStart, sizeof(uint64_t)); simd beginning(0, sizeof(uint64_t)); - simd va = - lsc_gather(PA, beginning); + simd va = lsc_gather(PA, beginning); simd_mask pred = 1; simd old_values = 0; lsc_prefetch(PA, offset); + cache_hint::cached, cache_hint::cached>(PA, offset); simd vb = - lsc_gather( - PA, offset, pred, old_values); - simd vc = - lsc_gather(PA, offset); + lsc_gather(PA, offset, pred, old_values); + simd vc = lsc_gather(PA, offset); va *= 5; vb += vc; - lsc_scatter(PA, beginning, va); - lsc_scatter(PA, offset, vb); + lsc_scatter(PA, beginning, va); + lsc_scatter(PA, offset, vb); }); }).wait(); } catch (sycl::exception const &e) { From 8baa8aca04510828859274890f57a5382496e8ed Mon Sep 17 00:00:00 2001 From: Dmitry Vodopyanov Date: Fri, 16 Jun 2023 14:53:47 +0200 Subject: [PATCH 48/55] [SYCL][FE][Driver] Disable fp-accuracy tests (#9935) They fail in post commit for a couple of days. https://github.com/intel/llvm/issues/9934 --- clang/test/CodeGen/fp-accuracy.c | 3 +++ clang/test/Driver/fp-accuracy.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/clang/test/CodeGen/fp-accuracy.c b/clang/test/CodeGen/fp-accuracy.c index 7cc5296089adc..a08b811d2e653 100644 --- a/clang/test/CodeGen/fp-accuracy.c +++ b/clang/test/CodeGen/fp-accuracy.c @@ -30,6 +30,9 @@ // RUN: -Wno-return-type -Wno-implicit-function-declaration -emit-llvm -o - %s \ // RUN: | FileCheck --check-prefixes=CHECK-DEFAULT %s +// Disabled due to https://github.com/intel/llvm/issues/9934 +// UNSUPPORTED: system-linux + #ifdef SPIR // This is a declaration when compiling with -fsycl to avoid // the compilation error "function with no prototype cannot use diff --git a/clang/test/Driver/fp-accuracy.c b/clang/test/Driver/fp-accuracy.c index e13c2dfc657f1..6ab48f92308d1 100644 --- a/clang/test/Driver/fp-accuracy.c +++ b/clang/test/Driver/fp-accuracy.c @@ -48,6 +48,8 @@ // RUN: -fmath-errno %s 2>&1 \ // RUN: | FileCheck %s --check-prefixes=ERR-3 +// Disabled due to https://github.com/intel/llvm/issues/9934 +// UNSUPPORTED: system-linux // HIGH: "-ffp-builtin-accuracy=high" // LOW: "-ffp-builtin-accuracy=low" From 91148649ca6090e4216b3535b247225328d0ad2c Mon Sep 17 00:00:00 2001 From: Evgeniy Date: Fri, 16 Jun 2023 08:49:07 -0700 Subject: [PATCH 49/55] [SYCL][ESIMD][E2E] Fix a copy-paste error in DPAS API tests (#9897) Test function was inlined into the kernel and remained in the module. Since begin/end CHECKs were not CHECK-LABELs a DPAS API call was matched with the code from the following function (kernel) because of inlining. 
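For readers unfamiliar with the FileCheck behaviour involved: a plain CHECK directive may match anywhere after the point where the previous directive matched, so a check intended for one function can be satisfied by code that was inlined into the following function. CHECK-LABEL directives split the input into blocks, and the CHECKs between two labels may only match inside their block. A minimal illustration with hypothetical function and mangled names, not taken from the test itself:

// The pair below passes even if the first function contains no such call,
// because the second directive is free to match inside the next function:
// CHECK: define dso_local spir_func void @_Z5firstv()
// CHECK: call spir_func void @_Z3foov()

// Bounding the region with labels confines the call check to the first
// function's block, so the same situation is now reported as a failure:
// CHECK-LABEL: define dso_local spir_func void @_Z5firstv()
// CHECK: call spir_func void @_Z3foov()
// CHECK-LABEL: define dso_local spir_func void @_Z6secondv()
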
--- sycl/test/esimd/dpas.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/test/esimd/dpas.cpp b/sycl/test/esimd/dpas.cpp index 27307b0ddfce0..cba9d5ded5cfa 100644 --- a/sycl/test/esimd/dpas.cpp +++ b/sycl/test/esimd/dpas.cpp @@ -58,7 +58,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void old_func() { constexpr int N_pvc = 16; constexpr int N_dg2 = 8; - // CHECK: define dso_local spir_func void @_Z8old_funcv() + // CHECK-LABEL: define dso_local spir_func void @_Z8old_funcv() { // ======= DPAS BF16 ======================================================= simd R_bf = 0; @@ -194,7 +194,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void old_func() { } old_func_end(); - // CHECK: call spir_func void @_Z12old_func_endv() + // CHECK-LABEL: call spir_func void @_Z12old_func_endv() } SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() { @@ -211,7 +211,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() { constexpr int N_pvc = 16; constexpr int N_dg2 = 8; - // CHECK: define dso_local spir_func void @_Z8xmx_funcv() + // CHECK-LABEL: define dso_local spir_func void @_Z8xmx_funcv() { // ======= DPAS BF16 ======================================================= simd R_bf = 0; @@ -325,8 +325,8 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() { simd R_f = 0; simd C_f = 0; - simd B_hf = 0; - simd A_hf = 0; + simd B_hf = 0; + simd A_hf = 0; // ------------ DPASW FP16: WITH THE ACCUMULATOR OPERAND ------------------- R_f = xmx::dpasw<8, 1, float>(C_f, B_hf, A_hf); @@ -360,5 +360,5 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() { } xmx_func_end(); - // CHECK: call spir_func void @_Z12xmx_func_endv() + // CHECK-LABEL: call spir_func void @_Z12xmx_func_endv() } From 0c775f40f65429afc97916dde1d23b234ecc8a92 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Fri, 16 Jun 2023 13:15:27 -0400 Subject: [PATCH 50/55] [SYCL][ESIMD][E2E] Enable thread_id_test on GPU (#9943) We updated the GPU driver in https://github.com/intel/llvm/commit/52a0fc8ed65d31b403fa68e24d4db8e77685171f and this test is passing on GPU now. The test uses XFAIL so the test is actually failing in CI. Fixes: https://github.com/intel/llvm/issues/9941 Signed-off-by: Sarnie, Nick --- sycl/test-e2e/ESIMD/thread_id_test.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/sycl/test-e2e/ESIMD/thread_id_test.cpp b/sycl/test-e2e/ESIMD/thread_id_test.cpp index 40bbb0b5fbbdc..ff186b1a32592 100644 --- a/sycl/test-e2e/ESIMD/thread_id_test.cpp +++ b/sycl/test-e2e/ESIMD/thread_id_test.cpp @@ -1,6 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// XFAIL: linux && gpu && !esimd_emulator //==- thread_id_test.cpp - Test to verify thread id functionlity-==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. @@ -10,8 +9,6 @@ //===----------------------------------------------------------------------===// // This is basic test to validate thread id functions. 
-// TODO: Enable the test once the GPU RT supporting the functionality reaches -// the CI #include #include From 35465da4824ad1cafb7c3f5c9c803df6ccf6fbf9 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 16 Jun 2023 12:57:08 -0500 Subject: [PATCH 51/55] [SYCL][Graph] Add initial support for SYCL Graph (1/4) (#9728) # Initial SYCL-Graph Patch This is the first patch of a series that adds support for an [experimental command graph extension](https://github.com/intel/llvm/pull/5626) A snapshot of the complete work can be seen in draft PR #9375 which has support all the specification defined ways of adding nodes and edges to the graph, including both Explicit and Record & Replay graph construction. The two types of nodes currently implemented are kernel execution and memcpy commands. See https://github.com/reble/llvm#implementation-status for the status of our total work. ## Scope This first patch focuses on ABI breaking changes and includes: * Experimental implementation of extension API (hpp and cpp). * New `sycl::handler` constructor and graph related member variables. * New scheduler command type `CG::CGTYPE::ExecCommandBuffer`. * Change `CGExecKernel::MHostKernel` member variable from a unique ptr to a shared ptr so that the command-group can be copied. * Extensions `unittests` tests. ## Following Split PRs Future follow-up PRs with the remainder of our work on the extension will include: * Add UR support with Level Zero implementation. (2/4) * Changes to the runtime not covered here, such as bugfixes and feature additions, will add symbols but not break the ABI. (3/4) * Add test-e2e tests for SYCL Graph extension. (4/4) * NFC changes - Design doc and codeowner update. ## Authors Co-authored-by: Pablo Reble Co-authored-by: Julian Miller Co-authored-by: Ben Tracy Co-authored-by: Ewan Crawford --- sycl/include/sycl/detail/cg.hpp | 21 +- sycl/include/sycl/detail/property_helper.hpp | 6 +- .../sycl/ext/oneapi/experimental/graph.hpp | 253 +++++++++++ sycl/include/sycl/handler.hpp | 32 ++ sycl/include/sycl/queue.hpp | 52 +++ sycl/source/CMakeLists.txt | 1 + sycl/source/detail/graph_impl.cpp | 415 ++++++++++++++++++ sycl/source/detail/graph_impl.hpp | 362 +++++++++++++++ sycl/source/detail/handler_impl.hpp | 2 + sycl/source/detail/queue_impl.hpp | 24 + sycl/source/detail/scheduler/commands.cpp | 18 +- sycl/source/handler.cpp | 90 +++- sycl/test/abi/layout_handler.cpp | 59 ++- sycl/test/abi/sycl_symbols_linux.dump | 18 + sycl/test/abi/sycl_symbols_windows.dump | 36 +- sycl/test/abi/symbol_size_alignment.cpp | 4 +- sycl/unittests/Extensions/CMakeLists.txt | 1 + sycl/unittests/Extensions/CommandGraph.cpp | 159 +++++++ sycl/unittests/scheduler/Regression.cpp | 6 +- 19 files changed, 1516 insertions(+), 43 deletions(-) create mode 100644 sycl/include/sycl/ext/oneapi/experimental/graph.hpp create mode 100644 sycl/source/detail/graph_impl.cpp create mode 100644 sycl/source/detail/graph_impl.hpp create mode 100644 sycl/unittests/Extensions/CommandGraph.cpp diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp index cf652d035a7f9..972a5daac601e 100644 --- a/sycl/include/sycl/detail/cg.hpp +++ b/sycl/include/sycl/detail/cg.hpp @@ -75,6 +75,7 @@ class CG { CopyToDeviceGlobal = 19, CopyFromDeviceGlobal = 20, ReadWriteHostPipe = 21, + ExecCommandBuffer = 22, }; struct StorageInitHelper { @@ -89,6 +90,7 @@ class CG { MSharedPtrStorage(std::move(SharedPtrStorage)), MRequirements(std::move(Requirements)), MEvents(std::move(Events)) {} StorageInitHelper(StorageInitHelper &&) = 
default; + StorageInitHelper(const StorageInitHelper &) = default; // The following storages are needed to ensure that arguments won't die // while we are using them. /// Storage for standard layout arguments. @@ -119,16 +121,23 @@ class CG { } CG(CG &&CommandGroup) = default; + CG(const CG &CommandGroup) = default; CGTYPE getType() { return MType; } - std::vector> &getArgsStorage() { return MData.MArgsStorage; } - std::vector &getAccStorage() { return MData.MAccStorage; } + std::vector> &getArgsStorage() { + return MData.MArgsStorage; + } + std::vector &getAccStorage() { + return MData.MAccStorage; + } std::vector> &getSharedPtrStorage() { return MData.MSharedPtrStorage; } - std::vector &getRequirements() { return MData.MRequirements; } + std::vector &getRequirements() { + return MData.MRequirements; + } std::vector &getEvents() { return MData.MEvents; } virtual ~CG() = default; @@ -151,7 +160,7 @@ class CGExecKernel : public CG { public: /// Stores ND-range description. NDRDescT MNDRDesc; - std::unique_ptr MHostKernel; + std::shared_ptr MHostKernel; std::shared_ptr MSyclKernel; std::shared_ptr MKernelBundle; std::vector MArgs; @@ -160,7 +169,7 @@ class CGExecKernel : public CG { std::vector> MAuxiliaryResources; RT::PiKernelCacheConfig MKernelCacheConfig; - CGExecKernel(NDRDescT NDRDesc, std::unique_ptr HKernel, + CGExecKernel(NDRDescT NDRDesc, std::shared_ptr HKernel, std::shared_ptr SyclKernel, std::shared_ptr KernelBundle, CG::StorageInitHelper CGData, std::vector Args, @@ -180,6 +189,8 @@ class CGExecKernel : public CG { "Wrong type of exec kernel CG."); } + CGExecKernel(const CGExecKernel &CGExec) = default; + std::vector getArguments() const { return MArgs; } std::string getKernelName() const { return MKernelName; } std::vector> getStreams() const { diff --git a/sycl/include/sycl/detail/property_helper.hpp b/sycl/include/sycl/detail/property_helper.hpp index 8be70fe39ae58..93eebdcb81ffb 100644 --- a/sycl/include/sycl/detail/property_helper.hpp +++ b/sycl/include/sycl/detail/property_helper.hpp @@ -43,8 +43,9 @@ enum DataLessPropKind { QueuePriorityNormal = 16, QueuePriorityLow = 17, QueuePriorityHigh = 18, + GraphNoCycleCheck = 19, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 18, + LastKnownDataLessPropKind = 19, // Exceeding 32 may cause ABI breaking change on some of OSes. DataLessPropKindSize = 32 }; @@ -58,7 +59,8 @@ enum PropWithDataKind { BufferMemChannel = 4, AccPropBufferLocation = 5, QueueComputeIndex = 6, - PropWithDataKindSize = 7, + GraphNodeDependencies = 7, + PropWithDataKindSize = 8 }; // Base class for dataless properties, needed to check that the type of an diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp new file mode 100644 index 0000000000000..9bbbe2680e7d9 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -0,0 +1,253 @@ +//==--------- graph.hpp --- SYCL graph extension ---------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { + +class handler; +class queue; +class device; +namespace ext { +namespace oneapi { +namespace experimental { + +namespace detail { +class node_impl; +class graph_impl; +class exec_graph_impl; + +} // namespace detail + +/// State to template the command_graph class on. +enum class graph_state { + modifiable, ///< In modifiable state, commands can be added to graph. + executable, ///< In executable state, the graph is ready to execute. +}; + +/// Class representing a node in the graph, returned by command_graph::add(). +class __SYCL_EXPORT node { +private: + node(const std::shared_ptr &Impl) : impl(Impl) {} + + template + friend decltype(Obj::impl) + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + template + friend T sycl::detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); + + std::shared_ptr impl; +}; + +namespace property { +namespace graph { + +/// Property passed to command_graph constructor to disable checking for cycles. +/// +/// \todo Cycle check not yet implemented. +class no_cycle_check : public ::sycl::detail::DataLessProperty< + ::sycl::detail::GraphNoCycleCheck> { +public: + no_cycle_check() = default; +}; + +} // namespace graph + +namespace node { + +/// Property used to define dependent nodes when creating a new node with +/// command_graph::add(). +class depends_on : public ::sycl::detail::PropertyWithData< + ::sycl::detail::GraphNodeDependencies> { +public: + template depends_on(NodeTN... nodes) : MDeps{nodes...} {} + + const std::vector<::sycl::ext::oneapi::experimental::node> & + get_dependencies() const { + return MDeps; + } + +private: + const std::vector<::sycl::ext::oneapi::experimental::node> MDeps; +}; + +} // namespace node +} // namespace property + +/// Graph in the modifiable state. +template +class __SYCL_EXPORT command_graph { +public: + /// Constructor. + /// @param SyclContext Context to use for graph. + /// @param SyclDevice Device all nodes will be associated with. + /// @param PropList Optional list of properties to pass. + command_graph(const context &SyclContext, const device &SyclDevice, + const property_list &PropList = {}); + + /// Add an empty node to the graph. + /// @param PropList Property list used to pass [0..n] predecessor nodes. + /// @return Constructed empty node which has been added to the graph. + node add(const property_list &PropList = {}) { + if (PropList.has_property()) { + auto Deps = PropList.get_property(); + return addImpl(Deps.get_dependencies()); + } + return addImpl({}); + } + + /// Add a command-group node to the graph. + /// @param CGF Command-group function to create node with. + /// @param PropList Property list used to pass [0..n] predecessor nodes. + /// @return Constructed node which has been added to the graph. + template node add(T CGF, const property_list &PropList = {}) { + if (PropList.has_property()) { + auto Deps = PropList.get_property(); + return addImpl(CGF, Deps.get_dependencies()); + } + return addImpl(CGF, {}); + } + + /// Add a dependency between two nodes. + /// @param Src Node which will be a dependency of \p Dest. + /// @param Dest Node which will be dependent on \p Src. + void make_edge(node &Src, node &Dest); + + /// Finalize modifiable graph into an executable graph. 
+ /// @param PropList Property list used to pass properties for finalization. + /// @return Executable graph object. + command_graph + finalize(const property_list &PropList = {}) const; + + /// Change the state of a queue to be recording and associate this graph with + /// it. + /// @param RecordingQueue The queue to change state on and associate this + /// graph with. + /// @return True if the queue had its state changed from executing to + /// recording. + bool begin_recording(queue &RecordingQueue); + + /// Change the state of multiple queues to be recording and associate this + /// graph with each of them. + /// @param RecordingQueues The queues to change state on and associate this + /// graph with. + /// @return True if any queue had its state changed from executing to + /// recording. + bool begin_recording(const std::vector &RecordingQueues); + + /// Set all queues currently recording to this graph to the executing state. + /// @return True if any queue had its state changed from recording to + /// executing. + bool end_recording(); + + /// Set a queue currently recording to this graph to the executing state. + /// @param RecordingQueue The queue to change state on. + /// @return True if the queue had its state changed from recording to + /// executing. + bool end_recording(queue &RecordingQueue); + + /// Set multiple queues currently recording to this graph to the executing + /// state. + /// @param RecordingQueues The queues to change state on. + /// @return True if any queue had its state changed from recording to + /// executing. + bool end_recording(const std::vector &RecordingQueues); + +private: + /// Constructor used internally by the runtime. + /// @param Impl Detail implementation class to construct object with. + command_graph(const std::shared_ptr &Impl) : impl(Impl) {} + + /// Template-less implementation of add() for CGF nodes. + /// @param CGF Command-group function to add. + /// @param Dep List of predecessor nodes. + /// @return Node added to the graph. + node addImpl(std::function CGF, + const std::vector &Dep); + + /// Template-less implementation of add() for empty nodes. + /// @param Dep List of predecessor nodes. + /// @return Node added to the graph. + node addImpl(const std::vector &Dep); + + template + friend decltype(Obj::impl) + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + template + friend T sycl::detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); + + std::shared_ptr impl; +}; + +template <> class __SYCL_EXPORT command_graph { +public: + /// An executable command-graph is not user constructable. + command_graph() = delete; + + /// Update the inputs & output of the graph. + /// @param Graph Graph to use the inputs and outputs of. + void update(const command_graph &Graph); + +private: + /// Constructor used by internal runtime. + /// @param Graph Detail implementation class to construct with. + /// @param Ctx Context to use for graph. + command_graph(const std::shared_ptr &Graph, + const sycl::context &Ctx); + + template + friend decltype(Obj::impl) + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + + /// Creates a backend representation of the graph in \p impl member variable. + void finalizeImpl(); + + int MTag; + std::shared_ptr impl; + + friend class command_graph; +}; + +/// Additional CTAD deduction guide. 
+template +command_graph(const context &SyclContext, const device &SyclDevice, + const property_list &PropList) -> command_graph; + +} // namespace experimental +} // namespace oneapi +} // namespace ext + +template <> +struct is_property + : std::true_type {}; + +template <> +struct is_property + : std::true_type {}; + +template <> +struct is_property_of< + ext::oneapi::experimental::property::graph::no_cycle_check, + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::modifiable>> : std::true_type { +}; + +template <> +struct is_property_of : std::true_type {}; + +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 86313cf734dae..1463babc8db64 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -37,6 +37,8 @@ #include #include +#include + #include #include #include @@ -102,6 +104,9 @@ template PrimaryQueue, std::shared_ptr SecondaryQueue, bool IsHost); + /// Constructs SYCL handler from Graph. + /// + /// The hander will add the command-group as a node to the graph rather than + /// enqueueing it straight away. + /// + /// \param Graph is a SYCL command_graph + handler(std::shared_ptr Graph); + /// Stores copy of Arg passed to the CGData.MArgsStorage. template >> @@ -2892,10 +2905,17 @@ class __SYCL_EXPORT handler { this->memcpy(Dest, Src, Count * sizeof(std::remove_all_extents_t), StartIndex * sizeof(std::remove_all_extents_t)); } + /// Executes a command_graph. + /// + /// \param Graph Executable command_graph to run + void ext_oneapi_graph(ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph); private: std::shared_ptr MImpl; std::shared_ptr MQueue; + /// The storage for the arguments passed. /// We need to store a copy of values that are passed explicitly through /// set_arg, require and so on, because we need them to be alive after @@ -2936,6 +2956,17 @@ class __SYCL_EXPORT handler { /// before barrier command can be executed std::vector MEventsWaitWithBarrier; + /// The graph that is associated with this handler. + std::shared_ptr MGraph; + /// If we are submitting a graph using ext_oneapi_graph this will be the graph + /// to be executed. + std::shared_ptr + MExecGraph; + /// Storage for a node created from a subgraph submission. + std::shared_ptr MSubgraphNode; + /// Storage for the CG created when handling graph nodes added explicitly. + std::unique_ptr MGraphNodeCG; + bool MIsHost = false; detail::code_location MCodeLoc = {}; @@ -3015,6 +3046,7 @@ class __SYCL_EXPORT handler { /// if write opeartion is blocking, default to false. void ext_intel_write_host_pipe(const std::string &Name, void *Ptr, size_t Size, bool Block = false); + friend class ext::oneapi::experimental::detail::graph_impl; bool DisableRangeRounding(); diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index a5edd309ba67b..add48de0fa8b1 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2038,6 +2038,58 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { // Clean KERNELFUNC macros. #undef _KERNELFUNCPARAM + /// Shortcut for executing a graph of commands. + /// + /// \param Graph the graph of commands to execute + /// \return an event representing graph execution operation. 
+ event ext_oneapi_graph( + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit([&](handler &CGH) { CGH.ext_oneapi_graph(Graph); }, CodeLoc); + } + + /// Shortcut for executing a graph of commands with a single dependency. + /// + /// \param Graph the graph of commands to execute + /// \param DepEvent is an event that specifies the graph execution + /// dependencies. + /// \return an event representing graph execution operation. + event ext_oneapi_graph( + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph, + event DepEvent, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.ext_oneapi_graph(Graph); + }, + CodeLoc); + } + + /// Shortcut for executing a graph of commands with multiple dependencies. + /// + /// \param Graph the graph of commands to execute + /// \param DepEvents is a vector of events that specifies the graph + /// execution dependencies. + /// \return an event representing graph execution operation. + event ext_oneapi_graph( + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph, + const std::vector &DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()) { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.ext_oneapi_graph(Graph); + }, + CodeLoc); + } + /// Returns whether the queue is in order or OoO /// /// Equivalent to has_property() diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 7e2b6eca02bc4..9820fe2e37c2f 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -175,6 +175,7 @@ set(SYCL_SOURCES "detail/fusion/fusion_wrapper.cpp" "detail/fusion/fusion_wrapper_impl.cpp" "detail/global_handler.cpp" + "detail/graph_impl.cpp" "detail/helpers.cpp" "detail/handler_proxy.cpp" "detail/image_accessor_util.cpp" diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp new file mode 100644 index 0000000000000..5fb47774dc6db --- /dev/null +++ b/sycl/source/detail/graph_impl.cpp @@ -0,0 +1,415 @@ +//==--------- graph_impl.cpp - SYCL graph extension -----------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { + +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +namespace { + +/// Recursively check if a given node is an exit node, and add the new nodes as +/// successors if so. +/// @param[in] CurrentNode Node to check as exit node. +/// @param[in] NewInputs Noes to add as successors. 
+void connectToExitNodes( + std::shared_ptr CurrentNode, + const std::vector> &NewInputs) { + if (CurrentNode->MSuccessors.size() > 0) { + for (auto Successor : CurrentNode->MSuccessors) { + connectToExitNodes(Successor, NewInputs); + } + + } else { + for (auto Input : NewInputs) { + CurrentNode->registerSuccessor(Input, CurrentNode); + } + } +} + +/// Recursive check if a graph node or its successors contains a given kernel +/// argument. +/// @param[in] Arg The kernel argument to check for. +/// @param[in] CurrentNode The current graph node being checked. +/// @param[in,out] Deps The unique list of dependencies which have been +/// identified for this arg. +/// @return True if a dependency was added in this node or any of its +/// successors. +bool checkForArg(const sycl::detail::ArgDesc &Arg, + const std::shared_ptr &CurrentNode, + std::set> &Deps) { + bool SuccessorAddedDep = false; + for (auto &Successor : CurrentNode->MSuccessors) { + SuccessorAddedDep |= checkForArg(Arg, Successor, Deps); + } + + if (!CurrentNode->isEmpty() && Deps.find(CurrentNode) == Deps.end() && + CurrentNode->hasArg(Arg) && !SuccessorAddedDep) { + Deps.insert(CurrentNode); + return true; + } + return SuccessorAddedDep; +} +} // anonymous namespace + +void exec_graph_impl::schedule() { + if (MSchedule.empty()) { + for (auto Node : MGraphImpl->MRoots) { + Node->sortTopological(Node, MSchedule); + } + } +} + +std::shared_ptr graph_impl::addSubgraphNodes( + const std::list> &NodeList) { + // Find all input and output nodes from the node list + std::vector> Inputs; + std::vector> Outputs; + for (auto &NodeImpl : NodeList) { + if (NodeImpl->MPredecessors.size() == 0) { + Inputs.push_back(NodeImpl); + } + if (NodeImpl->MSuccessors.size() == 0) { + Outputs.push_back(NodeImpl); + } + } + + // Recursively walk the graph to find exit nodes and connect up the inputs + // TODO: Consider caching exit nodes so we don't have to do this + for (auto NodeImpl : MRoots) { + connectToExitNodes(NodeImpl, Inputs); + } + + return this->add(Outputs); +} + +void graph_impl::addRoot(const std::shared_ptr &Root) { + MRoots.insert(Root); +} + +void graph_impl::removeRoot(const std::shared_ptr &Root) { + MRoots.erase(Root); +} + +std::shared_ptr +graph_impl::add(const std::vector> &Dep) { + const std::shared_ptr &NodeImpl = std::make_shared(); + + // TODO: Encapsulate in separate function to avoid duplication + if (!Dep.empty()) { + for (auto N : Dep) { + N->registerSuccessor(NodeImpl, N); // register successor + this->removeRoot(NodeImpl); // remove receiver from root node + // list + } + } else { + this->addRoot(NodeImpl); + } + + return NodeImpl; +} + +std::shared_ptr +graph_impl::add(const std::shared_ptr &Impl, + std::function CGF, + const std::vector &Args, + const std::vector> &Dep) { + sycl::handler Handler{Impl}; + CGF(Handler); + Handler.finalize(); + + // If the handler recorded a subgraph return that here as the relevant nodes + // have already been added. The node returned here is an empty node with + // dependencies on all the exit nodes of the subgraph. 
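  // For illustration, a hedged sketch of the user-facing explicit API that
  // funnels a command-group function into this add() overload. The add()
  // signature and the depends_on property mirror the unit tests further down;
  // the queue, pointer and kernel bodies are hypothetical.
  //
  //   namespace exp = sycl::ext::oneapi::experimental;
  //   exp::command_graph<exp::graph_state::modifiable> Graph{
  //       Queue.get_context(), Queue.get_device()};
  //   auto N1 = Graph.add([&](sycl::handler &CGH) {
  //     CGH.single_task([=]() { *Ptr = 42; });
  //   });
  //   auto N2 = Graph.add(
  //       [&](sycl::handler &CGH) { CGH.single_task([=]() { *Ptr += 1; }); },
  //       {exp::property::node::depends_on(N1)});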
+ if (Handler.MSubgraphNode) { + return Handler.MSubgraphNode; + } + if (Handler.MCGType == sycl::detail::CG::None) { + return this->add(Dep); + } + return this->add(Handler.MCGType, std::move(Handler.MGraphNodeCG), Dep); +} + +std::shared_ptr +graph_impl::add(sycl::detail::CG::CGTYPE CGType, + std::unique_ptr CommandGroup, + const std::vector> &Dep) { + // Copy deps so we can modify them + auto Deps = Dep; + if (CGType == sycl::detail::CG::Kernel) { + // A unique set of dependencies obtained by checking kernel arguments + // for accessors + std::set> UniqueDeps; + const auto &Args = + static_cast(CommandGroup.get())->MArgs; + for (auto &Arg : Args) { + if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) { + continue; + } + // Look through the graph for nodes which share this argument + for (auto NodePtr : MRoots) { + checkForArg(Arg, NodePtr, UniqueDeps); + } + } + + // Add any deps determined from accessor arguments into the dependency list + Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); + } + + // Add any nodes specified by event dependencies into the dependency list + for (auto Dep : CommandGroup->getEvents()) { + if (auto NodeImpl = MEventsMap.find(Dep); NodeImpl != MEventsMap.end()) { + Deps.push_back(NodeImpl->second); + } else { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Event dependency from handler::depends_on does " + "not correspond to a node within the graph"); + } + } + + const std::shared_ptr &NodeImpl = + std::make_shared(CGType, std::move(CommandGroup)); + if (!Deps.empty()) { + for (auto N : Deps) { + N->registerSuccessor(NodeImpl, N); // register successor + this->removeRoot(NodeImpl); // remove receiver from root node + // list + } + } else { + this->addRoot(NodeImpl); + } + return NodeImpl; +} + +bool graph_impl::clearQueues() { + bool AnyQueuesCleared = false; + for (auto &Queue : MRecordingQueues) { + Queue->setCommandGraph(nullptr); + AnyQueuesCleared = true; + } + MRecordingQueues.clear(); + + return AnyQueuesCleared; +} + +exec_graph_impl::~exec_graph_impl() { MSchedule.clear(); } + +sycl::event exec_graph_impl::enqueue( + const std::shared_ptr &Queue) { + std::vector RawEvents; + auto CreateNewEvent([&]() { + auto NewEvent = std::make_shared(Queue); + NewEvent->setContextImpl(Queue->getContextImplPtr()); + NewEvent->setStateIncomplete(); + return NewEvent; + }); + + sycl::detail::EventImplPtr NewEvent; + + { + std::vector> ScheduledEvents; + for (auto &NodeImpl : MSchedule) { + std::vector RawEvents; + + // If the node has no requirements for accessors etc. then we skip the + // scheduler and enqueue directly. 
+ if (NodeImpl->MCGType == sycl::detail::CG::Kernel && + NodeImpl->MCommandGroup->getRequirements().size() + + static_cast( + NodeImpl->MCommandGroup.get()) + ->MStreams.size() == + 0) { + sycl::detail::CGExecKernel *CG = + static_cast( + NodeImpl->MCommandGroup.get()); + NewEvent = CreateNewEvent(); + sycl::detail::pi::PiEvent *OutEvent = &NewEvent->getHandleRef(); + pi_int32 Res = sycl::detail::enqueueImpKernel( + Queue, CG->MNDRDesc, CG->MArgs, + // TODO: Handler KernelBundles + nullptr, CG->MSyclKernel, CG->MKernelName, RawEvents, OutEvent, + // TODO: Pass accessor mem allocations + nullptr, + // TODO: Extract from handler + PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT); + if (Res != pi_result::PI_SUCCESS) { + throw sycl::exception( + sycl::make_error_code(sycl::errc::kernel), + "Error during emulated graph command group submission."); + } + ScheduledEvents.push_back(NewEvent); + } else { + + sycl::detail::EventImplPtr EventImpl = + sycl::detail::Scheduler::getInstance().addCG( + std::move(NodeImpl->getCGCopy()), Queue); + + ScheduledEvents.push_back(EventImpl); + } + } + // Create an event which has all kernel events as dependencies + NewEvent = std::make_shared(Queue); + NewEvent->setStateIncomplete(); + NewEvent->getPreparedDepsEvents() = ScheduledEvents; + } + + sycl::event QueueEvent = + sycl::detail::createSyclObjFromImpl(NewEvent); + return QueueEvent; +} +} // namespace detail + +template <> +command_graph::command_graph( + const sycl::context &SyclContext, const sycl::device &SyclDevice, + const sycl::property_list &) + : impl(std::make_shared(SyclContext, SyclDevice)) {} + +template <> +node command_graph::addImpl( + const std::vector &Deps) { + std::vector> DepImpls; + for (auto &D : Deps) { + DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); + } + + std::shared_ptr NodeImpl = impl->add(DepImpls); + return sycl::detail::createSyclObjFromImpl(NodeImpl); +} + +template <> +node command_graph::addImpl( + std::function CGF, const std::vector &Deps) { + std::vector> DepImpls; + for (auto &D : Deps) { + DepImpls.push_back(sycl::detail::getSyclObjImpl(D)); + } + + std::shared_ptr NodeImpl = + impl->add(impl, CGF, {}, DepImpls); + return sycl::detail::createSyclObjFromImpl(NodeImpl); +} + +template <> +void command_graph::make_edge(node &Src, node &Dest) { + std::shared_ptr SenderImpl = + sycl::detail::getSyclObjImpl(Src); + std::shared_ptr ReceiverImpl = + sycl::detail::getSyclObjImpl(Dest); + + SenderImpl->registerSuccessor(ReceiverImpl, + SenderImpl); // register successor + impl->removeRoot(ReceiverImpl); // remove receiver from root node list +} + +template <> +command_graph +command_graph::finalize( + const sycl::property_list &) const { + return command_graph{this->impl, + this->impl->getContext()}; +} + +template <> +bool command_graph::begin_recording( + queue &RecordingQueue) { + auto QueueImpl = sycl::detail::getSyclObjImpl(RecordingQueue); + if (QueueImpl->getCommandGraph() == nullptr) { + QueueImpl->setCommandGraph(impl); + impl->addQueue(QueueImpl); + return true; + } + if (QueueImpl->getCommandGraph() != impl) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "begin_recording called for a queue which is already " + "recording to a different graph."); + } + + // Queue was already recording to this graph. 
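  // A hedged usage sketch of the recording workflow enabled here (the queue
  // submission and kernel are hypothetical; the API calls match the
  // declarations in graph.hpp):
  //
  //   Graph.begin_recording(Queue);           // Queue now records, not executes
  //   Queue.submit([&](sycl::handler &CGH) {  // captured as a graph node
  //     CGH.single_task([=]() { /* ... */ });
  //   });
  //   Graph.end_recording(Queue);             // Queue executes again
  //   auto Exec = Graph.finalize();           // executable command_graph
  //   Queue.ext_oneapi_graph(Exec);           // replay the recorded commands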
+ return false; +} + +template <> +bool command_graph::begin_recording( + const std::vector &RecordingQueues) { + bool QueueStateChanged = false; + for (queue Queue : RecordingQueues) { + QueueStateChanged |= this->begin_recording(Queue); + } + return QueueStateChanged; +} + +template <> bool command_graph::end_recording() { + return impl->clearQueues(); +} + +template <> +bool command_graph::end_recording( + queue &RecordingQueue) { + auto QueueImpl = sycl::detail::getSyclObjImpl(RecordingQueue); + if (QueueImpl->getCommandGraph() == impl) { + QueueImpl->setCommandGraph(nullptr); + impl->removeQueue(QueueImpl); + return true; + } + if (QueueImpl->getCommandGraph() != nullptr) { + throw sycl::exception(sycl::make_error_code(errc::invalid), + "end_recording called for a queue which is recording " + "to a different graph."); + } + + // Queue was not recording to a graph. + return false; +} + +template <> +bool command_graph::end_recording( + const std::vector &RecordingQueues) { + bool QueueStateChanged = false; + for (queue Queue : RecordingQueues) { + QueueStateChanged |= this->end_recording(Queue); + } + return QueueStateChanged; +} + +command_graph::command_graph( + const std::shared_ptr &Graph, const sycl::context &Ctx) + : MTag(rand()), + impl(std::make_shared(Ctx, Graph)) { + finalizeImpl(); // Create backend representation for executable graph +} + +void command_graph::finalizeImpl() { + // Create PI command-buffers for each device in the finalized context + impl->schedule(); +} + +void command_graph::update( + const command_graph &Graph) { + (void)Graph; + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Method not yet implemented"); +} + +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp new file mode 100644 index 0000000000000..ac4dca9e395dd --- /dev/null +++ b/sycl/source/detail/graph_impl.hpp @@ -0,0 +1,362 @@ +//==--------- graph_impl.hpp --- SYCL graph extension ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { + +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +/// Implementation of node class from SYCL_EXT_ONEAPI_GRAPH. +class node_impl { +public: + /// List of successors to this node. + std::vector> MSuccessors; + /// List of predecessors to this node. + /// + /// Using weak_ptr here to prevent circular references between nodes. + std::vector> MPredecessors; + /// Type of the command-group for the node. + sycl::detail::CG::CGTYPE MCGType = sycl::detail::CG::None; + /// Command group object which stores all args etc needed to enqueue the node + std::unique_ptr MCommandGroup; + + /// True if an empty node, false otherwise. + bool MIsEmpty = false; + + /// Add successor to the node. + /// @param Node Node to add as a successor. + /// @param Prev Predecessor to \p node being added as successor. 
+ /// + /// \p Prev should be a shared_ptr to an instance of this object, but can't + /// use a raw \p this pointer, so the extra \Prev parameter is passed. + void registerSuccessor(const std::shared_ptr &Node, + const std::shared_ptr &Prev) { + MSuccessors.push_back(Node); + Node->registerPredecessor(Prev); + } + + /// Add predecessor to the node. + /// @param Node Node to add as a predecessor. + void registerPredecessor(const std::shared_ptr &Node) { + MPredecessors.push_back(Node); + } + + /// Construct an empty node. + node_impl() : MIsEmpty(true) {} + + /// Construct a node representing a command-group. + /// @param CGType Type of the command-group. + /// @param CommandGroup The CG which stores the command information for this + /// node. + node_impl(sycl::detail::CG::CGTYPE CGType, + std::unique_ptr &&CommandGroup) + : MCGType(CGType), MCommandGroup(std::move(CommandGroup)) {} + + /// Recursively add nodes to execution stack. + /// @param NodeImpl Node to schedule. + /// @param Schedule Execution ordering to add node to. + void sortTopological(std::shared_ptr NodeImpl, + std::list> &Schedule) { + for (auto Next : MSuccessors) { + // Check if we've already scheduled this node + if (std::find(Schedule.begin(), Schedule.end(), Next) == Schedule.end()) + Next->sortTopological(Next, Schedule); + } + // We don't need to schedule empty nodes as they are only used when + // calculating dependencies + if (!NodeImpl->isEmpty()) + Schedule.push_front(NodeImpl); + } + + /// Checks if this node has an argument. + /// @param Arg Argument to lookup. + /// @return True if \p Arg is used in node, false otherwise. + bool hasArg(const sycl::detail::ArgDesc &Arg) { + // TODO: Handle types other than exec kernel + assert(MCGType == sycl::detail::CG::Kernel); + const auto &Args = + static_cast(MCommandGroup.get())->MArgs; + for (auto &NodeArg : Args) { + if (Arg.MType == NodeArg.MType && Arg.MSize == NodeArg.MSize) { + // Args are actually void** so we need to dereference them to compare + // actual values + void *IncomingPtr = *static_cast(Arg.MPtr); + void *ArgPtr = *static_cast(NodeArg.MPtr); + if (IncomingPtr == ArgPtr) { + return true; + } + } + } + return false; + } + + /// Query if this is an empty node. + /// @return True if this is an empty node, false otherwise. + bool isEmpty() const { return MIsEmpty; } + + /// Get a deep copy of this node's command group + /// @return A unique ptr to the new command group object. + std::unique_ptr getCGCopy() const { + switch (MCGType) { + case sycl::detail::CG::Kernel: + case sycl::detail::CG::RunOnHostIntel: + return createCGCopy(); + case sycl::detail::CG::CodeplayInteropTask: + assert(false); + // TODO: Uncomment this once we implement support for interop task so we can + // test required changes to the CG class. 
+ + // return createCGCopy(); + case sycl::detail::CG::CopyAccToPtr: + case sycl::detail::CG::CopyPtrToAcc: + case sycl::detail::CG::CopyAccToAcc: + return createCGCopy(); + case sycl::detail::CG::Fill: + return createCGCopy(); + case sycl::detail::CG::UpdateHost: + return createCGCopy(); + case sycl::detail::CG::CopyUSM: + return createCGCopy(); + case sycl::detail::CG::FillUSM: + return createCGCopy(); + case sycl::detail::CG::PrefetchUSM: + return createCGCopy(); + case sycl::detail::CG::AdviseUSM: + return createCGCopy(); + case sycl::detail::CG::Copy2DUSM: + return createCGCopy(); + case sycl::detail::CG::Fill2DUSM: + return createCGCopy(); + case sycl::detail::CG::Memset2DUSM: + return createCGCopy(); + case sycl::detail::CG::CodeplayHostTask: + assert(false); + // TODO: Uncomment this once we implement support for host task so we can + // test required changes to the CG class. + + // return createCGCopy(); + case sycl::detail::CG::Barrier: + case sycl::detail::CG::BarrierWaitlist: + return createCGCopy(); + case sycl::detail::CG::CopyToDeviceGlobal: + return createCGCopy(); + case sycl::detail::CG::CopyFromDeviceGlobal: + return createCGCopy(); + case sycl::detail::CG::ReadWriteHostPipe: + return createCGCopy(); + case sycl::detail::CG::ExecCommandBuffer: + assert(false && + "Error: Command graph submission should not be a node in a graph"); + break; + case sycl::detail::CG::None: + assert(false && + "Error: Empty nodes should not be enqueue to a command buffer"); + break; + } + return nullptr; + } + +private: + /// Creates a copy of the node's CG by casting to it's actual type, then using + /// that to copy construct and create a new unique ptr from that copy. + /// @tparam CGT The derived type of the CG. + /// @return A new unique ptr to the copied CG. + template std::unique_ptr createCGCopy() const { + return std::make_unique(*static_cast(MCommandGroup.get())); + } +}; + +/// Implementation details of command_graph. +class graph_impl { +public: + /// Constructor. + /// @param SyclContext Context to use for graph. + /// @param SyclDevice Device to create nodes with. + graph_impl(const sycl::context &SyclContext, const sycl::device &SyclDevice) + : MContext(SyclContext), MDevice(SyclDevice), MRecordingQueues(), + MEventsMap() {} + + /// Insert node into list of root nodes. + /// @param Root Node to add to list of root nodes. + void addRoot(const std::shared_ptr &Root); + + /// Remove node from list of root nodes. + /// @param Root Node to remove from list of root nodes. + void removeRoot(const std::shared_ptr &Root); + + /// Create a kernel node in the graph. + /// @param CGType Type of the command-group. + /// @param CommandGroup The CG which stores all information for this node. + /// @param Dep Dependencies of the created node. + /// @return Created node in the graph. + std::shared_ptr + add(sycl::detail::CG::CGTYPE CGType, + std::unique_ptr CommandGroup, + const std::vector> &Dep = {}); + + /// Create a CGF node in the graph. + /// @param Impl Graph implementation pointer to create a handler with. + /// @param CGF Command-group function to create node with. + /// @param Args Node arguments. + /// @param Dep Dependencies of the created node. + /// @return Created node in the graph. + std::shared_ptr + add(const std::shared_ptr &Impl, + std::function CGF, + const std::vector &Args, + const std::vector> &Dep = {}); + + /// Create an empty node in the graph. + /// @param Dep List of predecessor nodes. + /// @return Created node in the graph. 
+ std::shared_ptr + add(const std::vector> &Dep = {}); + + /// Add a queue to the set of queues which are currently recording to this + /// graph. + /// @param RecordingQueue Queue to add to set. + void + addQueue(const std::shared_ptr &RecordingQueue) { + MRecordingQueues.insert(RecordingQueue); + } + + /// Remove a queue from the set of queues which are currently recording to + /// this graph. + /// @param RecordingQueue Queue to remove from set. + void + removeQueue(const std::shared_ptr &RecordingQueue) { + MRecordingQueues.erase(RecordingQueue); + } + + /// Remove all queues which are recording to this graph, also sets all queues + /// cleared back to the executing state. + /// + /// @return True if any queues were removed. + bool clearQueues(); + + /// Associate a sycl event with a node in the graph. + /// @param EventImpl Event to associate with a node in map. + /// @param NodeImpl Node to associate with event in map. + void addEventForNode(std::shared_ptr EventImpl, + std::shared_ptr NodeImpl) { + MEventsMap[EventImpl] = NodeImpl; + } + + /// Find the sycl event associated with a node. + /// @param NodeImpl Node to find event for. + /// @return Event associated with node. + std::shared_ptr + getEventForNode(std::shared_ptr NodeImpl) const { + if (auto EventImpl = std::find_if( + MEventsMap.begin(), MEventsMap.end(), + [NodeImpl](auto &it) { return it.second == NodeImpl; }); + EventImpl != MEventsMap.end()) { + return EventImpl->first; + } + + throw sycl::exception( + sycl::make_error_code(errc::invalid), + "No event has been recorded for the specified graph node"); + } + + /// Adds sub-graph nodes from an executable graph to this graph. + /// @return An empty node is used to schedule dependencies on this sub-graph. + std::shared_ptr + addSubgraphNodes(const std::list> &NodeList); + + /// Query for the context tied to this graph. + /// @return Context associated with graph. + sycl::context getContext() const { return MContext; } + + /// List of root nodes. + std::set> MRoots; + +private: + /// Context associated with this graph. + sycl::context MContext; + /// Device associated with this graph. All graph nodes will execute on this + /// device. + sycl::device MDevice; + /// Unique set of queues which are currently recording to this graph. + std::set> MRecordingQueues; + /// Map of events to their associated recorded nodes. + std::unordered_map, + std::shared_ptr> + MEventsMap; +}; + +/// Class representing the implementation of command_graph. +class exec_graph_impl { +public: + /// Constructor. + /// @param Context Context to create graph with. + /// @param GraphImpl Modifiable graph implementation to create with. + exec_graph_impl(sycl::context Context, + const std::shared_ptr &GraphImpl) + : MSchedule(), MGraphImpl(GraphImpl), MContext(Context) {} + + /// Destructor. + /// + /// Releases any PI command-buffers the object has created. + ~exec_graph_impl(); + + /// Add nodes to MSchedule. + void schedule(); + + /// Called by handler::ext_oneapi_command_graph() to schedule graph for + /// execution. + /// @param Queue Command-queue to schedule execution on. + /// @return Event associated with the execution of the graph. + sycl::event enqueue(const std::shared_ptr &Queue); + + /// Query for the context tied to this graph. + /// @return Context associated with graph. + sycl::context getContext() const { return MContext; } + + /// Query the scheduling of node execution. + /// @return List of nodes in execution order. 
+ const std::list> &getSchedule() const { + return MSchedule; + } + +private: + /// Execution schedule of nodes in the graph. + std::list> MSchedule; + /// Pointer to the modifiable graph impl associated with this executable + /// graph. + std::shared_ptr MGraphImpl; + /// Context associated with this executable graph. + sycl::context MContext; + /// List of requirements for enqueueing this command graph, accumulated from + /// all nodes enqueued to the graph. + std::vector MRequirements; +}; + +} // namespace detail +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 3cb23c1df8d54..490aab12fc205 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -29,6 +29,8 @@ class handler_impl { : MSubmissionPrimaryQueue(std::move(SubmissionPrimaryQueue)), MSubmissionSecondaryQueue(std::move(SubmissionSecondaryQueue)){}; + handler_impl() = default; + void setStateExplicitKernelBundle() { if (MSubmissionState == HandlerSubmissionState::SPEC_CONST_SET_STATE) throw sycl::exception( diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 91823c4bb6cd7..34cb82a8d465b 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -41,6 +41,13 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { + +// forward declaration + +namespace ext::oneapi::experimental::detail { +class graph_impl; +} + namespace detail { using ContextImplPtr = std::shared_ptr; @@ -645,6 +652,16 @@ class queue_impl { bool isProfilingLimited() { return MLimitedProfiling; } + void setCommandGraph( + std::shared_ptr Graph) { + MGraph = Graph; + } + + std::shared_ptr + getCommandGraph() const { + return MGraph; + } + protected: // template is needed for proper unit testing template @@ -819,6 +836,13 @@ class queue_impl { // able to discard events, because the final decision is made right before the // operation itself. const bool MHasDiscardEventsSupport; + + // Command graph which is associated with this queue for the purposes of + // recording commands to it. + std::shared_ptr MGraph = + nullptr; + + friend class sycl::ext::oneapi::experimental::detail::node_impl; }; } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index a143917117d31..87595e6ff23e3 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2293,8 +2293,8 @@ void DispatchNativeKernel(void *Blob) { std::vector *Reqs = static_cast *>(CastedBlob[0]); - std::unique_ptr *HostKernel = - static_cast *>(CastedBlob[1]); + std::shared_ptr *HostKernel = + static_cast *>(CastedBlob[1]); NDRDescT *NDRDesc = static_cast(CastedBlob[2]); @@ -2571,10 +2571,13 @@ pi_int32 ExecCGCommand::enqueueImp() { std::vector *CopyReqs = new std::vector(HostTask->getRequirements()); - // Not actually a copy, but move. Should be OK as it's not expected that - // MHostKernel will be used elsewhere. - std::unique_ptr *CopyHostKernel = - new std::unique_ptr(std::move(HostTask->MHostKernel)); + // Create a shared_ptr on the heap so that the reference count is + // incremented until the DispatchNativeKernel() callback is run, which + // will free the heap shared_ptr and decrement the reference count. This + // prevents errors when the HostTask command-group is deleted before + // DispatchNativeKernel() can be run. 
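  // The general pattern applied just below, sketched for illustration (names
  // are generic, not from this codebase): keep a heap-allocated shared_ptr
  // alive across an asynchronous C-style callback that only sees a void*.
  //
  //   auto *Keep = new std::shared_ptr<T>(Obj); // extra ref owned by the blob
  //   enqueueWithCallback(BlobContaining(Keep), &Callback);
  //   // ...inside Callback(void *Blob):
  //   //   auto *P = static_cast<std::shared_ptr<T> *>(Blob);
  //   //   (*P)->run();
  //   //   delete P;                            // releases the extra ref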
+ std::shared_ptr *CopyHostKernel = + new std::shared_ptr(HostTask->MHostKernel); NDRDescT *CopyNDRDesc = new NDRDescT(HostTask->MNDRDesc); @@ -2901,6 +2904,9 @@ pi_int32 ExecCGCommand::enqueueImp() { return enqueueReadWriteHostPipe(MQueue, pipeName, blocking, hostPtr, typeSize, RawEvents, Event, read); } + case CG::CGTYPE::ExecCommandBuffer: { + throw runtime_error("CG type not implemented.", PI_ERROR_INVALID_OPERATION); + } case CG::CGTYPE::None: throw runtime_error("CG type not implemented.", PI_ERROR_INVALID_OPERATION); } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 1704321317d8c..cbb85a77f81b5 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -52,6 +53,10 @@ handler::handler(std::shared_ptr Queue, std::move(SecondaryQueue))), MQueue(std::move(Queue)), MIsHost(IsHost) {} +handler::handler( + std::shared_ptr Graph) + : MImpl(std::make_shared()), MGraph(Graph) {} + // Sets the submission state to indicate that an explicit kernel bundle has been // set. Throws a sycl::exception with errc::invalid if the current state // indicates that a specialization constant has been set. @@ -106,6 +111,15 @@ event handler::finalize() { return MLastEvent; MIsFinalized = true; + // If we have a subgraph node that means that a subgraph was recorded as + // part of this queue submission, so we skip adding a new node here since + // they have already been added, and return the event associated with the + // subgraph node. + if (MQueue && MQueue->getCommandGraph() && MSubgraphNode) { + return detail::createSyclObjFromImpl( + MQueue->getCommandGraph()->getEventForNode(MSubgraphNode)); + } + // According to 4.7.6.9 of SYCL2020 spec, if a placeholder accessor is passed // to a command without being bound to a command group, an exception should // be thrown. There should be as many requirements as unique accessors, @@ -179,10 +193,11 @@ event handler::finalize() { } } - if (!MQueue->is_in_fusion_mode() && CGData.MRequirements.size() + - CGData.MEvents.size() + - MStreamStorage.size() == - 0) { + if (MQueue && !MQueue->getCommandGraph() && !MGraph && !MSubgraphNode && + !MQueue->is_in_fusion_mode() && + CGData.MRequirements.size() + CGData.MEvents.size() + + MStreamStorage.size() == + 0) { // if user does not add a new dependency to the dependency graph, i.e. // the graph is not changed, and the queue is not in fusion mode, then // this faster path is used to submit kernel bypassing scheduler and @@ -347,10 +362,19 @@ event handler::finalize() { case detail::CG::ReadWriteHostPipe: { CommandGroup.reset(new detail::CGReadWriteHostPipe( MImpl->HostPipeName, MImpl->HostPipeBlocking, MImpl->HostPipePtr, - MImpl->HostPipeTypeSize, MImpl->HostPipeRead, - std::move(CGData), MCodeLoc)); + MImpl->HostPipeTypeSize, MImpl->HostPipeRead, std::move(CGData), + MCodeLoc)); break; } + case detail::CG::ExecCommandBuffer: + // If we have a subgraph node we don't want to actually execute this command + // graph submission. + if (!MSubgraphNode) { + event GraphCompletionEvent = MExecGraph->enqueue(MQueue); + MLastEvent = GraphCompletionEvent; + return MLastEvent; + } + break; case detail::CG::None: if (detail::pi::trace(detail::pi::TraceLevel::PI_TRACE_ALL)) { std::cout << "WARNING: An empty command group is submitted." << std::endl; @@ -360,11 +384,36 @@ event handler::finalize() { return MLastEvent; } - if (!CommandGroup) + if (!MSubgraphNode && !CommandGroup) throw sycl::runtime_error( "Internal Error. 
Command group cannot be constructed.", PI_ERROR_INVALID_OPERATION); + // If there is a graph associated with the handler we are in the explicit + // graph mode, so we store the CG instead of submitting it to the scheduler, + // so it can be retrieved by the graph later. + if (MGraph) { + MGraphNodeCG = std::move(CommandGroup); + return detail::createSyclObjFromImpl( + std::make_shared()); + } + + // If the queue has an associated graph then we need to take the CG and pass + // it to the graph to create a node, rather than submit it to the scheduler. + if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl) { + auto EventImpl = std::make_shared(); + + // Extract relevant data from the handler and pass to graph to create a + // new node representing this command group. + std::shared_ptr NodeImpl = + GraphImpl->add(MCGType, std::move(CommandGroup)); + + // Associate an event with this new node and return the event. + GraphImpl->addEventForNode(EventImpl, NodeImpl); + + return detail::createSyclObjFromImpl(EventImpl); + } + detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG( std::move(CommandGroup), std::move(MQueue)); @@ -960,5 +1009,32 @@ void handler::setKernelCacheConfig(detail::RT::PiKernelCacheConfig Config) { MImpl->MKernelCacheConfig = Config; } +void handler::ext_oneapi_graph( + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph) { + MCGType = detail::CG::ExecCommandBuffer; + auto GraphImpl = detail::getSyclObjImpl(Graph); + std::shared_ptr ParentGraph; + if (MQueue) { + ParentGraph = MQueue->getCommandGraph(); + } else { + ParentGraph = MGraph; + } + + // If a parent graph is set that means we are adding or recording a subgraph + if (ParentGraph) { + // Store the node representing the subgraph in the handler so that we can + // return it to the user later. + MSubgraphNode = ParentGraph->addSubgraphNodes(GraphImpl->getSchedule()); + // Associate an event with the subgraph node. + auto SubgraphEvent = std::make_shared(); + ParentGraph->addEventForNode(SubgraphEvent, MSubgraphNode); + } else { + // Set the exec graph for execution during finalize. 
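  // The two submission paths handled by this function, sketched for
  // illustration (MainGraph, SubGraph and Queue are hypothetical):
  //
  //   auto Exec = SubGraph.finalize();
  //   // Parent graph present (explicit add or a recording queue): Exec is
  //   // spliced in as a subgraph via addSubgraphNodes() above.
  //   MainGraph.add([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(Exec); });
  //   // No parent graph: Exec is stored below in MExecGraph and enqueued
  //   // when handler::finalize() runs.
  //   Queue.ext_oneapi_graph(Exec);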
+ MExecGraph = GraphImpl; + } +} + } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/test/abi/layout_handler.cpp b/sycl/test/abi/layout_handler.cpp index 0ce535b1e495c..5df3b206ed4f6 100644 --- a/sycl/test/abi/layout_handler.cpp +++ b/sycl/test/abi/layout_handler.cpp @@ -171,20 +171,45 @@ void foo() { // CHECK: 480 | pointer _M_start // CHECK-NEXT: 488 | pointer _M_finish // CHECK-NEXT: 496 | pointer _M_end_of_storage -// CHECK-NEXT: 504 | _Bool MIsHost -// CHECK-NEXT: 512 | struct sycl::detail::code_location MCodeLoc -// CHECK-NEXT: 512 | const char * MFileName -// CHECK-NEXT: 520 | const char * MFunctionName -// CHECK-NEXT: 528 | unsigned long MLineNo -// CHECK-NEXT: 536 | unsigned long MColumnNo -// CHECK-NEXT: 544 | _Bool MIsFinalized -// CHECK-NEXT: 552 | class sycl::event MLastEvent -// CHECK-NEXT: 552 | class sycl::detail::OwnerLessBase (base) (empty) -// CHECK-NEXT: 552 | class std::shared_ptr impl -// CHECK-NEXT: 552 | class std::__shared_ptr (base) -// CHECK-NEXT: 552 | class std::__shared_ptr_access (base) (empty) -// CHECK-NEXT: 552 | element_type * _M_ptr -// CHECK-NEXT: 560 | class std::__shared_count<> _M_refcount -// CHECK-NEXT: 560 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi -// CHECK-NEXT: | [sizeof=568, dsize=568, align=8, -// CHECK-NEXT: | nvsize=568, nvalign=8] +// CHECK-NEXT: 504 | class std::shared_ptr MGraph +// CHECK-NEXT: 504 | class std::__shared_ptr (base) +// CHECK-NEXT: 504 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 504 | element_type * _M_ptr +// CHECK-NEXT: 512 | class std::__shared_count<> _M_refcount +// CHECK-NEXT: 512 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi +// CHECK-NEXT: 520 | class std::shared_ptr MExecGraph +// CHECK-NEXT: 520 | class std::__shared_ptr (base) +// CHECK-NEXT: 520 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 520 | element_type * _M_ptr +// CHECK-NEXT: 528 | class std::__shared_count<> _M_refcount +// CHECK-NEXT: 528 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi +// CHECK-NEXT: 536 | class std::shared_ptr MSubgraphNode +// CHECK-NEXT: 536 | class std::__shared_ptr (base) +// CHECK-NEXT: 536 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 536 | element_type * _M_ptr +// CHECK-NEXT: 544 | class std::__shared_count<> _M_refcount +// CHECK-NEXT: 544 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi +// CHECK-NEXT: 552 | class std::unique_ptr MGraphNodeCG +// CHECK: 552 | class std::__uniq_ptr_impl > +// CHECK-NEXT: 552 | class std::tuple > _M_t +// CHECK-NEXT: 552 | struct std::_Tuple_impl<0, class sycl::detail::CG *, struct std::default_delete > (base) +// CHECK-NEXT: 552 | struct std::_Tuple_impl<1, struct std::default_delete > (base) (empty) +// CHECK: 552 | struct std::_Head_base<0, class sycl::detail::CG *> (base) +// CHECK-NEXT: 552 | class sycl::detail::CG * _M_head_impl +// CHECK-NEXT: 560 | _Bool MIsHost +// CHECK-NEXT: 568 | struct sycl::detail::code_location MCodeLoc +// CHECK-NEXT: 568 | const char * MFileName +// CHECK-NEXT: 576 | const char * MFunctionName +// CHECK-NEXT: 584 | unsigned long MLineNo +// CHECK-NEXT: 592 | unsigned long MColumnNo +// CHECK-NEXT: 600 | _Bool MIsFinalized +// CHECK-NEXT: 608 | class sycl::event MLastEvent +// CHECK-NEXT: 608 | class sycl::detail::OwnerLessBase (base) (empty) +// CHECK-NEXT: 608 | class std::shared_ptr impl +// CHECK-NEXT: 608 | class std::__shared_ptr (base) +// CHECK-NEXT: 608 | class std::__shared_ptr_access (base) (empty) +// CHECK-NEXT: 608 | element_type * _M_ptr +// CHECK-NEXT: 616 | class 
std::__shared_count<> _M_refcount +// CHECK-NEXT: 616 | _Sp_counted_base<(_Lock_policy)2U> * _M_pi +// CHECK-NEXT: | [sizeof=624, dsize=624, align=8, +// CHECK-NEXT: | nvsize=624, nvalign=8] diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 83ac345048b8c..48240ed4c2876 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3661,6 +3661,20 @@ _ZN4sycl3_V13ext6oneapi10level_zero10make_queueERKNS0_7contextERKNS0_6deviceEmbb _ZN4sycl3_V13ext6oneapi10level_zero11make_deviceERKNS0_8platformEm _ZN4sycl3_V13ext6oneapi10level_zero12make_contextERKSt6vectorINS0_6deviceESaIS5_EEmb _ZN4sycl3_V13ext6oneapi10level_zero13make_platformEm +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EE13end_recordingERKSt6vectorINS0_5queueESaIS8_EE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EE13end_recordingERNS0_5queueE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EE13end_recordingEv +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EE15begin_recordingERKSt6vectorINS0_5queueESaIS8_EE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EE15begin_recordingERNS0_5queueE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EE7addImplERKSt6vectorINS3_4nodeESaIS8_EE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EE7addImplESt8functionIFvRNS0_7handlerEEERKSt6vectorINS3_4nodeESaISD_EE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EE9make_edgeERNS3_4nodeES8_ +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EEC1ERKNS0_7contextERKNS0_6deviceERKNS0_13property_listE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EEC2ERKNS0_7contextERKNS0_6deviceERKNS0_13property_listE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE1EE12finalizeImplEv +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE1EE6updateERKNS4_ILS5_0EEE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE1EEC1ERKSt10shared_ptrINS3_6detail10graph_implEERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE1EEC2ERKSt10shared_ptrINS3_6detail10graph_implEERKNS0_7contextE _ZN4sycl3_V13ext6oneapi15filter_selectorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZN4sycl3_V13ext6oneapi15filter_selectorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZN4sycl3_V13ext8codeplay12experimental14fusion_wrapper12start_fusionEv @@ -3982,6 +3996,7 @@ _ZN4sycl3_V17handler10mem_adviseEPKvmi _ZN4sycl3_V17handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmbb _ZN4sycl3_V17handler12addReductionERKSt10shared_ptrIKvE _ZN4sycl3_V17handler13getKernelNameB5cxx11Ev +_ZN4sycl3_V17handler16ext_oneapi_graphENS0_3ext6oneapi12experimental13command_graphILNS4_11graph_stateE1EEE _ZN4sycl3_V17handler17supportsUSMFill2DEv _ZN4sycl3_V17handler17use_kernel_bundleERKNS0_13kernel_bundleILNS0_12bundle_stateE2EEE _ZN4sycl3_V17handler18RangeRoundingTraceEv @@ -4017,8 +4032,10 @@ _ZN4sycl3_V17handler6memsetEPvim _ZN4sycl3_V17handler7barrierERKSt6vectorINS0_5eventESaIS3_EE _ZN4sycl3_V17handler8finalizeEv _ZN4sycl3_V17handler8prefetchEPKvm +_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b 
_ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb +_ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb _ZN4sycl3_V17samplerC1ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE @@ -4081,6 +4098,7 @@ _ZNK4sycl3_V115interop_handler12GetNativeMemEPNS0_6detail16AccessorImplHostE _ZNK4sycl3_V115interop_handler14GetNativeQueueERi _ZNK4sycl3_V116default_selectorclERKNS0_6deviceE _ZNK4sycl3_V120accelerator_selectorclERKNS0_6deviceE +_ZNK4sycl3_V13ext6oneapi12experimental13command_graphILNS3_11graph_stateE0EE8finalizeERKNS0_13property_listE _ZNK4sycl3_V13ext6oneapi15filter_selector13select_deviceEv _ZNK4sycl3_V13ext6oneapi15filter_selector5resetEv _ZNK4sycl3_V13ext6oneapi15filter_selectorclERKNS0_6deviceE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 010b1b966ec74..d79aecbb338fc 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -382,6 +382,12 @@ ??$has_property@Vuse_primary_context@cuda@context@property@_V1@sycl@@@image_plain@detail@_V1@sycl@@IEBA_NXZ ??$has_property@Vuse_primary_context@cuda@context@property@_V1@sycl@@@sampler@_V1@sycl@@QEBA_NXZ ??$has_property@Vuse_primary_context@cuda@context@property@_V1@sycl@@@stream@_V1@sycl@@QEBA_NXZ +??0?$command_graph@$00@experimental@oneapi@ext@_V1@sycl@@AEAA@AEBV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@AEBVcontext@45@@Z +??0?$command_graph@$00@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z +??0?$command_graph@$00@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z +??0?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z +??0?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z +??0?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBVcontext@45@AEBVdevice@45@AEBVproperty_list@45@@Z ??0AccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VAccessorImplHost@detail@_V1@sycl@@@std@@@Z ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0AccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z @@ -503,6 +509,7 @@ ??0gpu_selector@_V1@sycl@@QEAA@XZ ??0half@host_half_impl@detail@_V1@sycl@@QEAA@AEBM@Z ??0half@host_half_impl@detail@_V1@sycl@@QEAA@G@Z +??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N@Z ??0host_selector@_V1@sycl@@QEAA@$$QEAV012@@Z @@ -548,6 +555,9 @@ ??0kernel_id@_V1@sycl@@AEAA@PEBD@Z ??0kernel_id@_V1@sycl@@QEAA@$$QEAV012@@Z ??0kernel_id@_V1@sycl@@QEAA@AEBV012@@Z +??0node@experimental@oneapi@ext@_V1@sycl@@AEAA@AEBV?$shared_ptr@Vnode_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z +??0node@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z +??0node@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z ??0platform@_V1@sycl@@AEAA@AEBVdevice@12@@Z ??0platform@_V1@sycl@@AEAA@V?$shared_ptr@Vplatform_impl@detail@_V1@sycl@@@std@@@Z ??0platform@_V1@sycl@@QEAA@$$QEAV012@@Z @@ -586,6 +596,8 @@ ??0stream_impl@detail@_V1@sycl@@QEAA@_K0AEBVproperty_list@23@@Z ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@AEBUcode_location@123@@Z ??0tls_code_loc_t@detail@_V1@sycl@@QEAA@XZ 
+??1?$command_graph@$00@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ +??1?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1AccessorBaseHost@detail@_V1@sycl@@QEAA@XZ ??1AccessorImplHost@detail@_V1@sycl@@QEAA@XZ ??1LocalAccessorBaseHost@detail@_V1@sycl@@QEAA@XZ @@ -617,6 +629,7 @@ ??1kernel@_V1@sycl@@QEAA@XZ ??1kernel_bundle_plain@detail@_V1@sycl@@QEAA@XZ ??1kernel_id@_V1@sycl@@QEAA@XZ +??1node@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1platform@_V1@sycl@@QEAA@XZ ??1queue@_V1@sycl@@QEAA@XZ ??1sampler@_V1@sycl@@QEAA@XZ @@ -640,6 +653,10 @@ ??4?$OwnerLessBase@Vqueue@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z ??4?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z +??4?$command_graph@$00@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z +??4?$command_graph@$00@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z +??4?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z +??4?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??4AccessorBaseHost@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z ??4AccessorBaseHost@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4AccessorImplHost@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z @@ -696,6 +713,8 @@ ??4kernel_bundle_plain@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4kernel_id@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z ??4kernel_id@_V1@sycl@@QEAAAEAV012@AEBV012@@Z +??4node@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z +??4node@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??4platform@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z ??4platform@_V1@sycl@@QEAAAEAV012@AEBV012@@Z ??4queue@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z @@ -767,8 +786,8 @@ ?Clear@exception_list@_V1@sycl@@AEAAXXZ ?DirSep@OSUtil@detail@_V1@sycl@@2QEBDEB ?DisableRangeRounding@handler@_V1@sycl@@AEAA_NXZ -?GDBMethodsAnchor@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ ?GDBMethodsAnchor@SampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ +?GDBMethodsAnchor@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ ?GetNativeMem@interop_handler@_V1@sycl@@AEBA_KPEAVAccessorImplHost@detail@23@@Z ?GetNativeQueue@interop_handler@_V1@sycl@@AEBA_KAEAH@Z ?GetRangeRoundingSettings@handler@_V1@sycl@@AEAAXAEA_K00@Z @@ -807,6 +826,8 @@ ?addHostAccessorAndWait@detail@_V1@sycl@@YAXPEAVAccessorImplHost@123@@Z ?addHostSampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVSampledImageAccessorImplHost@123@@Z ?addHostUnsampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVUnsampledImageAccessorImplHost@123@@Z +?addImpl@?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@AEAA?AVnode@23456@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z +?addImpl@?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@AEAA?AVnode@23456@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@9@@Z ?addInteropObject@buffer_impl@detail@_V1@sycl@@QEBAXAEAV?$vector@_KV?$allocator@_K@std@@@std@@@Z ?addOrReplaceAccessorProperties@SYCLMemObjT@detail@_V1@sycl@@QEAAXAEBVproperty_list@34@@Z ?addOrReplaceAccessorProperties@buffer_plain@detail@_V1@sycl@@IEAAXAEBVproperty_list@34@@Z @@ -856,6 +877,8 @@ ?barrier@handler@_V1@sycl@@QEAAXXZ 
?begin@exception_list@_V1@sycl@@QEBA?AV?$_Vector_const_iterator@V?$_Vector_val@U?$_Simple_types@Vexception_ptr@std@@@std@@@std@@@std@@XZ ?begin@kernel_bundle_plain@detail@_V1@sycl@@IEBAPEBVdevice_image_plain@234@XZ +?begin_recording@?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAA_NAEAVqueue@56@@Z +?begin_recording@?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAA_NAEBV?$vector@Vqueue@_V1@sycl@@V?$allocator@Vqueue@_V1@sycl@@@std@@@std@@@Z ?build_impl@detail@_V1@sycl@@YA?AV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@AEBV?$kernel_bundle@$0A@@23@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@5@AEBVproperty_list@23@@Z ?canReuseHostPtr@SYCLMemObjT@detail@_V1@sycl@@QEAA_NPEAX_K@Z ?cancel_fusion@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAAXXZ @@ -896,6 +919,9 @@ ?end@HostProfilingInfo@detail@_V1@sycl@@QEAAXXZ ?end@exception_list@_V1@sycl@@QEBA?AV?$_Vector_const_iterator@V?$_Vector_val@U?$_Simple_types@Vexception_ptr@std@@@std@@@std@@@std@@XZ ?end@kernel_bundle_plain@detail@_V1@sycl@@IEBAPEBVdevice_image_plain@234@XZ +?end_recording@?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAA_NAEAVqueue@56@@Z +?end_recording@?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAA_NAEBV?$vector@Vqueue@_V1@sycl@@V?$allocator@Vqueue@_V1@sycl@@@std@@@std@@@Z +?end_recording@?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAA_NXZ ?ext_codeplay_supports_fusion@queue@_V1@sycl@@QEBA_NXZ ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z @@ -904,6 +930,10 @@ ?ext_oneapi_empty@queue@_V1@sycl@@QEBA_NXZ ?ext_oneapi_fill2d_impl@handler@_V1@sycl@@AEAAXPEAX_KPEBX111@Z ?ext_oneapi_get_default_context@platform@_V1@sycl@@QEBA?AVcontext@23@XZ +?ext_oneapi_graph@handler@_V1@sycl@@QEAAXV?$command_graph@$00@experimental@oneapi@ext@23@@Z +?ext_oneapi_graph@queue@_V1@sycl@@QEAA?AVevent@23@V?$command_graph@$00@experimental@oneapi@ext@23@AEBUcode_location@detail@23@@Z +?ext_oneapi_graph@queue@_V1@sycl@@QEAA?AVevent@23@V?$command_graph@$00@experimental@oneapi@ext@23@AEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z +?ext_oneapi_graph@queue@_V1@sycl@@QEAA?AVevent@23@V?$command_graph@$00@experimental@oneapi@ext@23@V423@AEBUcode_location@detail@23@@Z ?ext_oneapi_memcpy2d_impl@handler@_V1@sycl@@AEAAXPEAX_KPEBX111@Z ?ext_oneapi_memset2d_impl@handler@_V1@sycl@@AEAAXPEAX_KH11@Z ?ext_oneapi_owner_before@?$OwnerLessBase@Vcontext@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBV?$weak_object_base@Vcontext@_V1@sycl@@@2oneapi@ext@34@@Z @@ -929,7 +959,9 @@ ?fill@MemoryManager@detail@_V1@sycl@@SAXPEAVSYCLMemObjI@234@PEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KPEBDIV?$range@$02@34@5V?$id@$02@34@IV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@7@AEAPEAU_pi_event@@@Z ?fill_2d_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_K22AEBV?$vector@DV?$allocator@D@std@@@6@V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@@Z ?fill_usm@MemoryManager@detail@_V1@sycl@@SAXPEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KHV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@6@PEAPEAU_pi_event@@@Z +?finalize@?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEBA?AV?$command_graph@$00@23456@AEBVproperty_list@56@@Z 
?finalize@handler@_V1@sycl@@AEAA?AVevent@23@XZ +?finalizeImpl@?$command_graph@$00@experimental@oneapi@ext@_V1@sycl@@AEAAXXZ ?find_device_intersection@detail@_V1@sycl@@YA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@V?$kernel_bundle@$00@_V1@sycl@@V?$allocator@V?$kernel_bundle@$00@_V1@sycl@@@std@@@5@@Z ?flush@stream_impl@detail@_V1@sycl@@QEAAXAEBV?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@Z ?flush@stream_impl@detail@_V1@sycl@@QEAAXXZ @@ -1163,6 +1195,7 @@ ?make_device@detail@_V1@sycl@@YA?AVdevice@23@_KW4backend@23@@Z ?make_device@level_zero@oneapi@ext@_V1@sycl@@YA?AVdevice@45@AEBVplatform@45@_K@Z ?make_device@opencl@_V1@sycl@@YA?AVdevice@23@_K@Z +?make_edge@?$command_graph@$0A@@experimental@oneapi@ext@_V1@sycl@@QEAAXAEAVnode@23456@0@Z ?make_error_code@_V1@sycl@@YA?AVerror_code@std@@W4errc@12@@Z ?make_event@detail@_V1@sycl@@YA?AVevent@23@_KAEBVcontext@23@W4backend@23@@Z ?make_event@detail@_V1@sycl@@YA?AVevent@23@_KAEBVcontext@23@_NW4backend@23@@Z @@ -4878,6 +4911,7 @@ ?throw_asynchronous@queue@_V1@sycl@@QEAAXXZ ?unmap@MemoryManager@detail@_V1@sycl@@SAXPEAVSYCLMemObjI@234@PEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@1V?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@7@AEAPEAU_pi_event@@@Z ?unset_flag@stream@_V1@sycl@@AEBAXI@Z +?update@?$command_graph@$00@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$command_graph@$0A@@23456@@Z ?updateHostMemory@SYCLMemObjT@detail@_V1@sycl@@IEAAXQEAX@Z ?updateHostMemory@SYCLMemObjT@detail@_V1@sycl@@IEAAXXZ ?useHostPtr@SYCLMemObjT@detail@_V1@sycl@@QEAA_NXZ diff --git a/sycl/test/abi/symbol_size_alignment.cpp b/sycl/test/abi/symbol_size_alignment.cpp index 50f801db31939..1d80680340bd1 100644 --- a/sycl/test/abi/symbol_size_alignment.cpp +++ b/sycl/test/abi/symbol_size_alignment.cpp @@ -52,9 +52,9 @@ int main() { check(); check(); #ifdef _MSC_VER - check(); + check(); #else - check(); + check(); #endif check, 16, 8>(); check(); diff --git a/sycl/unittests/Extensions/CMakeLists.txt b/sycl/unittests/Extensions/CMakeLists.txt index ebe99f15ec84d..85bcd63e874ed 100644 --- a/sycl/unittests/Extensions/CMakeLists.txt +++ b/sycl/unittests/Extensions/CMakeLists.txt @@ -7,5 +7,6 @@ add_sycl_unittest(ExtensionsTests OBJECT USMMemcpy2D.cpp DeviceGlobal.cpp OneAPISubGroupMask.cpp + CommandGraph.cpp ) diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp new file mode 100644 index 0000000000000..bb876f14484ed --- /dev/null +++ b/sycl/unittests/Extensions/CommandGraph.cpp @@ -0,0 +1,159 @@ +//==--------------------- CommandGraph.cpp -------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "sycl/ext/oneapi/experimental/graph.hpp" +#include + +#include "detail/graph_impl.hpp" + +#include +#include +#include + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi; + +class CommandGraphTest : public ::testing::Test { +public: + CommandGraphTest() + : Mock{}, Plat{Mock.getPlatform()}, Dev{Plat.get_devices()[0]}, + Queue{Dev}, Graph{Queue.get_context(), Dev} {} + +protected: + void SetUp() override {} + +protected: + unittest::PiMock Mock; + sycl::platform Plat; + sycl::device Dev; + sycl::queue Queue; + experimental::command_graph Graph; +}; + +TEST_F(CommandGraphTest, AddNode) { + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + ASSERT_TRUE(GraphImpl->MRoots.size() == 0); + + auto Node1 = Graph.add([&](sycl::handler &cgh) {}); + + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1) != nullptr); + ASSERT_TRUE(GraphImpl->MRoots.size() == 1); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 0); + + // Add a node which depends on the first + auto Node2 = Graph.add([&](sycl::handler &cgh) {}, + {experimental::property::node::depends_on(Node1)}); + ASSERT_TRUE(GraphImpl->MRoots.size() == 1); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 1); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.front() == + sycl::detail::getSyclObjImpl(Node2)); + + // Add a third node which depends on both + auto Node3 = + Graph.add([&](sycl::handler &cgh) {}, + {experimental::property::node::depends_on(Node1, Node2)}); + ASSERT_TRUE(GraphImpl->MRoots.size() == 1); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 2); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.size() == 1); + + // Add a fourth node without any dependencies on the others + auto Node4 = Graph.add([&](sycl::handler &cgh) {}); + ASSERT_TRUE(GraphImpl->MRoots.size() == 2); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 2); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MSuccessors.size() == 1); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node3)->MSuccessors.size() == 0); +} + +TEST_F(CommandGraphTest, MakeEdge) { + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + + auto Node1 = Graph.add([&](sycl::handler &cgh) {}); + auto Node2 = Graph.add([&](sycl::handler &cgh) {}); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 0); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size() == 0); + + Graph.make_edge(Node1, Node2); + + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node1)->MSuccessors.size() == 1); + ASSERT_TRUE(sycl::detail::getSyclObjImpl(Node2)->MPredecessors.size() == 1); +} + +TEST_F(CommandGraphTest, BeginEndRecording) { + sycl::queue Queue2{Dev}; + + // Test throwing behaviour + // Check we can repeatedly begin recording on the same queues + ASSERT_NO_THROW(Graph.begin_recording(Queue)); + ASSERT_NO_THROW(Graph.begin_recording(Queue)); + ASSERT_NO_THROW(Graph.begin_recording(Queue2)); + ASSERT_NO_THROW(Graph.begin_recording(Queue2)); + // Check we can repeatedly end recording on the same queues + ASSERT_NO_THROW(Graph.end_recording(Queue)); + ASSERT_NO_THROW(Graph.end_recording(Queue)); + ASSERT_NO_THROW(Graph.end_recording(Queue2)); + ASSERT_NO_THROW(Graph.end_recording(Queue2)); + // Vector versions + ASSERT_NO_THROW(Graph.begin_recording({Queue, Queue2})); + 
ASSERT_NO_THROW(Graph.begin_recording({Queue, Queue2}));
+  ASSERT_NO_THROW(Graph.end_recording({Queue, Queue2}));
+  ASSERT_NO_THROW(Graph.end_recording({Queue, Queue2}));
+
+  experimental::command_graph Graph2(Queue.get_context(), Dev);
+
+  Graph.begin_recording(Queue);
+  // Trying to record to a second Graph should throw
+  ASSERT_ANY_THROW(Graph2.begin_recording(Queue));
+  // Trying to end when it is recording to a different graph should throw
+  ASSERT_ANY_THROW(Graph2.end_recording(Queue));
+  Graph.end_recording(Queue);
+
+  // Testing return values of begin and end recording
+  // Queue should change state so should return true here
+  ASSERT_TRUE(Graph.begin_recording(Queue));
+  // But not changed state here
+  ASSERT_FALSE(Graph.begin_recording(Queue));
+
+  // Queue2 should change state so should return true here
+  ASSERT_TRUE(Graph.begin_recording(Queue2));
+  // But not changed state here
+  ASSERT_FALSE(Graph.begin_recording(Queue2));
+
+  // Queue should have changed state so should return true
+  ASSERT_TRUE(Graph.end_recording(Queue));
+  // But not changed state here
+  ASSERT_FALSE(Graph.end_recording(Queue));
+
+  // Should end recording on Queue2
+  ASSERT_TRUE(Graph.end_recording());
+  // State should not change on Queue2 now
+  ASSERT_FALSE(Graph.end_recording(Queue2));
+
+  // Testing vector begin and end
+  ASSERT_TRUE(Graph.begin_recording({Queue, Queue2}));
+  // Both should now not have state changed
+  ASSERT_FALSE(Graph.begin_recording(Queue));
+  ASSERT_FALSE(Graph.begin_recording(Queue2));
+
+  // End recording on both
+  ASSERT_TRUE(Graph.end_recording({Queue, Queue2}));
+  // Both should now not have state changed
+  ASSERT_FALSE(Graph.end_recording(Queue));
+  ASSERT_FALSE(Graph.end_recording(Queue2));
+
+  // First begin recording on a single queue
+  ASSERT_TRUE(Graph.begin_recording(Queue));
+  // Vector begin should still return true as Queue2 has state changed
+  ASSERT_TRUE(Graph.begin_recording({Queue, Queue2}));
+  // End recording on Queue2
+  ASSERT_TRUE(Graph.end_recording(Queue2));
+  // Vector end should still return true as Queue will have state changed
+  ASSERT_TRUE(Graph.end_recording({Queue, Queue2}));
+}
diff --git a/sycl/unittests/scheduler/Regression.cpp b/sycl/unittests/scheduler/Regression.cpp
index fafaab81cd80f..4bca3c4757121 100644
--- a/sycl/unittests/scheduler/Regression.cpp
+++ b/sycl/unittests/scheduler/Regression.cpp
@@ -38,8 +38,8 @@ static pi_result redefinedEnqueueNativeKernel(
   EXPECT_EQ(Reqs[0]->MAccessRange[1], MockReq.MAccessRange[1]);
   EXPECT_EQ(Reqs[0]->MAccessRange[2], MockReq.MAccessRange[2]);
 
-  std::unique_ptr *HostKernel =
-      static_cast *>(CastedBlob[1]);
+  std::shared_ptr *HostKernel =
+      static_cast *>(CastedBlob[1]);
   testing::internal::CaptureStdout();
   (*HostKernel)->call(NDRDesc, nullptr);
   std::string Output = testing::internal::GetCapturedStdout();
@@ -73,7 +73,7 @@ TEST_F(SchedulerTest, CheckArgsBlobInPiEnqueueNativeKernelIsValid) {
   std::unique_ptr CG{new detail::CGExecKernel(
       /*NDRDesc*/ NDRDesc, /*HKernel*/
-      std::make_unique>(HKernel),
+      std::make_shared>(HKernel),
       /*SyclKernel*/ nullptr,
       /*KernelBundle*/ nullptr, std::move(CGData),
       /*Args*/ {},

From 6c1125ff8bf8614de56e150171872cadf2043f5d Mon Sep 17 00:00:00 2001
From: aelovikov-intel 
Date: Fri, 16 Jun 2023 13:06:34 -0700
Subject: [PATCH 52/55] [CI] Change trigger events for post commit task (#9950)

1) Don't run in pre-commit to enhance security
2) Run automatically on a push to a branch starting with
"test-devops-pr" to enable testing for CI-related changes

---
.github/workflows/sycl_post_commit.yml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl_post_commit.yml index 965fae8efb584..ff6f993640888 100644 --- a/.github/workflows/sycl_post_commit.yml +++ b/.github/workflows/sycl_post_commit.yml @@ -4,16 +4,7 @@ on: push: branches: - sycl - pull_request: - branches: - - sycl - paths: - - .github/workflows/sycl_post_commit.yml - - .github/workflows/sycl_gen_test_matrix.yml - - .github/workflows/sycl_linux_build_and_test.yml - - .github/workflows/sycl_windows_build_and_test.yml - - .github/workflows/sycl_macos_build_and_test.yml - workflow_dispatch: + - test-devops-pr/** jobs: # This job generates matrix of tests for SYCL End-to-End tests From 2f260c2c094ddc4b7d07e5a11bc5c713558f8939 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Mon, 19 Jun 2023 01:51:51 -0700 Subject: [PATCH 53/55] [SYCL] Fix post-commit failures after PR#9728 (#9949) --- sycl/source/detail/graph_impl.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 5fb47774dc6db..fc899132eaf81 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -131,6 +131,7 @@ graph_impl::add(const std::shared_ptr &Impl, std::function CGF, const std::vector &Args, const std::vector> &Dep) { + (void)Args; sycl::handler Handler{Impl}; CGF(Handler); Handler.finalize(); @@ -258,8 +259,8 @@ sycl::event exec_graph_impl::enqueue( } else { sycl::detail::EventImplPtr EventImpl = - sycl::detail::Scheduler::getInstance().addCG( - std::move(NodeImpl->getCGCopy()), Queue); + sycl::detail::Scheduler::getInstance().addCG(NodeImpl->getCGCopy(), + Queue); ScheduledEvents.push_back(EventImpl); } From 1a6f7a51502bfe6bf177d7302cd1f08de4a8ec02 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 19 Jun 2023 03:52:17 -0500 Subject: [PATCH 54/55] [SYCL][Graph] Fix unannotated fall-through between switch labels (#9966) Together with [PR#9949](https://github.com/intel/llvm/pull/9949): Fix post-commit failures after [PR#9728](https://github.com/intel/llvm/pull/9728) --- sycl/source/detail/graph_impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index ac4dca9e395dd..ba528dcec6765 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -125,6 +125,7 @@ class node_impl { return createCGCopy(); case sycl::detail::CG::CodeplayInteropTask: assert(false); + break; // TODO: Uncomment this once we implement support for interop task so we can // test required changes to the CG class. @@ -153,6 +154,7 @@ class node_impl { return createCGCopy(); case sycl::detail::CG::CodeplayHostTask: assert(false); + break; // TODO: Uncomment this once we implement support for host task so we can // test required changes to the CG class. From 83f877975c97acdb38d84f94dc146571cd522e0e Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Mon, 19 Jun 2023 13:28:33 +0100 Subject: [PATCH 55/55] [SYCL] Revert recent copy requirements temporarily (#9970) This commit reverts the recently introduced requirements on `is_device_copyable` in copy commands. This is a temporary revert until `sycl::vec` is made trivially copyable, to appropriately satisfy `is_device_copyable` implicitly. 
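For context, the checks being removed in the diff below had this general
shape (an illustrative sketch only, not part of the patch; `copy_like_api`
is a hypothetical stand-in for the affected `handler` copy/fill members):

    #include <sycl/sycl.hpp>

    // Sketch: the pattern/element type was required to satisfy
    // sycl::is_device_copyable, which trivially copyable types meet
    // implicitly; sycl::vec does not yet, hence the temporary revert.
    template <typename T> void copy_like_api(const T &Pattern) {
      static_assert(sycl::is_device_copyable<T>::value,
                    "Pattern must be device copyable");
      (void)Pattern; // ... enqueue the actual copy/fill operation ...
    }

Once `sycl::vec` is made trivially copyable, the asserts can be
reintroduced without breaking existing users.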
--------- Signed-off-by: Larsen, Steffen --- sycl/include/sycl/handler.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 1463babc8db64..91f3a9aa336fd 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -2289,8 +2289,8 @@ class __SYCL_EXPORT handler { "Invalid accessor target for the copy method."); static_assert(isValidModeForDestinationAccessor(AccessMode), "Invalid accessor mode for the copy method."); - static_assert(is_device_copyable::value, - "Pattern must be device copyable"); + // TODO: Add static_assert with is_device_copyable when vec is + // device-copyable. // Make sure data shared_ptr points to is not released until we finish // work with it. CGData.MSharedPtrStorage.push_back(Src); @@ -2360,8 +2360,8 @@ class __SYCL_EXPORT handler { "Invalid accessor target for the copy method."); static_assert(isValidModeForDestinationAccessor(AccessMode), "Invalid accessor mode for the copy method."); - static_assert(is_device_copyable::value, - "Pattern must be device copyable"); + // TODO: Add static_assert with is_device_copyable when vec is + // device-copyable. #ifndef __SYCL_DEVICE_ONLY__ if (MIsHost) { // TODO: Temporary implementation for host. Should be handled by memory