From c46ec835e525a43a9f60813f2afd0f4a2ad4a844 Mon Sep 17 00:00:00 2001
From: Maxime France-Pillois <maxime.francepillois@codeplay.com>
Date: Thu, 24 Aug 2023 11:03:15 +0100
Subject: [PATCH] [SYCL][Graph] Implement graph enqueue for CUDA backend (#305)

* [SYCL][Graph] Implement graph enqueue for CUDA backend

Implements urCommandBufferAppendKernelLaunchExp function to append kernel to a command buffer.
Implements urCommandBufferAppendKernelLaunchExp function to enqueue a command buffer to a cuda stream.
Enable all tests supported by the current state of the Cuda backend.

* [SYCL][Graph] Refactors the code using helper to set Kernel params

* [SYCL][Graph] Adds exception catching

* [SYCL][Graph] Reformats paramaters of internal functions in PascalCase

* [SYCL][Graph] Fixes command buffer enqueuing issue in multithreaded environment.

Sets a ScopedContext before enqueuing command buffer.

* [SYCL][Graph] Adds missing exception catch
---
 .../ur/adapters/cuda/command_buffer.cpp       | 180 +++++++++++--
 .../ur/adapters/cuda/command_buffer.hpp       | 179 ++++++++++++-
 .../ur/adapters/cuda/enqueue.cpp              | 250 ++++++++++--------
 .../ur/adapters/cuda/enqueue.hpp              |  17 ++
 .../Explicit/add_node_while_recording.cpp     |   2 +-
 .../Explicit/add_nodes_after_finalize.cpp     |   2 +-
 sycl/test-e2e/Graph/Explicit/basic_buffer.cpp |   2 +-
 sycl/test-e2e/Graph/Explicit/basic_usm.cpp    |   2 +-
 .../Graph/Explicit/basic_usm_host.cpp         |   2 +-
 .../Graph/Explicit/basic_usm_mixed.cpp        |   2 +-
 .../Graph/Explicit/basic_usm_shared.cpp       |   2 +-
 .../Graph/Explicit/basic_usm_system.cpp       |   2 +-
 .../Graph/Explicit/buffer_ordering.cpp        |   2 +-
 sycl/test-e2e/Graph/Explicit/depends_on.cpp   |   2 +-
 sycl/test-e2e/Graph/Explicit/dotp.cpp         |   2 +-
 sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp  |   2 +-
 .../test-e2e/Graph/Explicit/dotp_host_mem.cpp |   2 +-
 .../Graph/Explicit/dotp_host_shared.cpp       |   2 +-
 sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp   |   2 +-
 .../Graph/Explicit/dotp_shared_mem.cpp        |   2 +-
 .../Graph/Explicit/dotp_system_mem.cpp        |   2 +-
 sycl/test-e2e/Graph/Explicit/empty.cpp        |   2 +-
 sycl/test-e2e/Graph/Explicit/empty_node.cpp   |   2 +-
 .../Graph/Explicit/empty_with_deps.cpp        |   2 +-
 .../Graph/Explicit/enqueue_ordering.cpp       |   2 +-
 .../Graph/Explicit/multiple_exec_graphs.cpp   |   2 +-
 .../test-e2e/Graph/Explicit/node_ordering.cpp |   2 +-
 .../test-e2e/Graph/Explicit/repeated_exec.cpp |   2 +-
 sycl/test-e2e/Graph/Explicit/saxpy.cpp        |   2 +-
 sycl/test-e2e/Graph/Explicit/single_node.cpp  |   2 +-
 sycl/test-e2e/Graph/Explicit/sub_graph.cpp    |   2 +-
 .../sub_graph_execute_without_parent.cpp      |   2 +-
 .../sub_graph_multiple_submission.cpp         |   2 +-
 .../Graph/Explicit/sub_graph_nested.cpp       |   2 +-
 .../Explicit/sub_graph_two_parent_graphs.cpp  |   2 +-
 .../Explicit/temp_buffer_reinterpret.cpp      |   2 +-
 .../Graph/Explicit/while_recording.cpp        |   2 +-
 .../RecordReplay/add_nodes_after_finalize.cpp |   2 +-
 .../Graph/RecordReplay/basic_buffer.cpp       |   2 +-
 .../test-e2e/Graph/RecordReplay/basic_usm.cpp |   2 +-
 .../Graph/RecordReplay/basic_usm_host.cpp     |   2 +-
 .../Graph/RecordReplay/basic_usm_mixed.cpp    |   2 +-
 .../Graph/RecordReplay/basic_usm_shared.cpp   |   2 +-
 .../Graph/RecordReplay/basic_usm_system.cpp   |   2 +-
 .../Graph/RecordReplay/buffer_ordering.cpp    |   2 +-
 .../Graph/RecordReplay/concurrent_queue.cpp   |   2 +-
 sycl/test-e2e/Graph/RecordReplay/dotp.cpp     |   2 +-
 .../Graph/RecordReplay/dotp_buffer.cpp        |   2 +-
 .../Graph/RecordReplay/dotp_host_mem.cpp      |   2 +-
 .../Graph/RecordReplay/dotp_host_shared.cpp   |   2 +-
 .../Graph/RecordReplay/dotp_in_order.cpp      |   2 +-
 .../dotp_in_order_with_empty_nodes.cpp        |   2 +-
 .../RecordReplay/dotp_multiple_queues.cpp     |   2 +-
 .../Graph/RecordReplay/dotp_shared_mem.cpp    |   2 +-
 .../Graph/RecordReplay/dotp_system_mem.cpp    |   2 +-
 sycl/test-e2e/Graph/RecordReplay/empty.cpp    |   2 +-
 .../Graph/RecordReplay/empty_node.cpp         |   2 +-
 .../RecordReplay/empty_node_with_dep.cpp      |   2 +-
 .../RecordReplay/multiple_exec_graphs.cpp     |   2 +-
 .../Graph/RecordReplay/repeated_exec.cpp      |   2 +-
 sycl/test-e2e/Graph/RecordReplay/saxpy.cpp    |   2 +-
 .../Graph/RecordReplay/simple_shared_usm.cpp  |   2 +-
 .../test-e2e/Graph/RecordReplay/sub_graph.cpp |   2 +-
 .../sub_graph_execute_without_parent.cpp      |   2 +-
 .../Graph/RecordReplay/sub_graph_in_order.cpp |   2 +-
 .../sub_graph_multiple_submission.cpp         |   2 +-
 .../Graph/RecordReplay/sub_graph_nested.cpp   |   2 +-
 .../sub_graph_two_parent_graphs.cpp           |   2 +-
 .../RecordReplay/temp_buffer_reinterpret.cpp  |   2 +-
 .../Graph/RecordReplay/temp_scope.cpp         |   2 +-
 sycl/test-e2e/Graph/Threading/finalize.cpp    |   2 +-
 sycl/test-e2e/Graph/Threading/queue_state.cpp |   2 +-
 sycl/test-e2e/Graph/Threading/submit.cpp      |   2 +-
 sycl/test-e2e/Graph/Threading/update.cpp      |   2 +-
 ...ackend_create_and_finalize_empty_graph.cpp |  35 ---
 sycl/test-e2e/Graph/empty_graph.cpp           |   2 +-
 sycl/test-e2e/Graph/finalize_empty.cpp        |   2 +-
 sycl/test-e2e/Graph/finalize_twice.cpp        |   2 +-
 78 files changed, 562 insertions(+), 245 deletions(-)
 delete mode 100644 sycl/test-e2e/Graph/cuda_backend_create_and_finalize_empty_graph.cpp

diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp
index 66d193198a3c4..32b7a8b78b224 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp
@@ -8,13 +8,16 @@
 
 #include "command_buffer.hpp"
 #include "common.hpp"
-
-/// Stub implementations of UR experimental feature command-buffers
+#include "enqueue.hpp"
+#include "event.hpp"
+#include "kernel.hpp"
+#include "memory.hpp"
+#include "queue.hpp"
 
 ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
     ur_context_handle_t hContext, ur_device_handle_t hDevice)
     : Context(hContext),
-      Device(hDevice), cudaGraph{nullptr}, cudaGraphExec{nullptr}, RefCount{1} {
+      Device(hDevice), CudaGraph{nullptr}, CudaGraphExec{nullptr}, RefCount{1} {
   urContextRetain(hContext);
   urDeviceRetain(hDevice);
 }
@@ -29,10 +32,43 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
   urDeviceRelease(Device);
 
   // Release the memory allocated to the CudaGraph
-  cuGraphDestroy(cudaGraph);
+  cuGraphDestroy(CudaGraph);
 
   // Release the memory allocated to the CudaGraphExec
-  cuGraphExecDestroy(cudaGraphExec);
+  cuGraphExecDestroy(CudaGraphExec);
+}
+
+/// Helper function for finding the Cuda Nodes associated with the
+/// commands in a command-buffer, each event is pointed to by a sync-point in
+/// the wait list.
+///
+/// @param[in] CommandBuffer to lookup the events from.
+/// @param[in] NumSyncPointsInWaitList Length of \p SyncPointWaitList.
+/// @param[in] SyncPointWaitList List of sync points in \p CommandBuffer
+/// to find the events for.
+/// @param[out] CuNodesList Return parameter for the Cuda Nodes associated with
+/// each sync-point in \p SyncPointWaitList.
+///
+/// @return UR_RESULT_SUCCESS or an error code on failure
+static ur_result_t getNodesFromSyncPoints(
+    const ur_exp_command_buffer_handle_t &CommandBuffer,
+    size_t NumSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
+    std::vector<CUgraphNode> &CuNodesList) {
+  // Map of ur_exp_command_buffer_sync_point_t to ur_event_handle_t defining
+  // the event associated with each sync-point
+  auto SyncPoints = CommandBuffer->SyncPoints;
+
+  // For each sync-point add associated L0 event to the return list.
+  for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
+    if (auto NodeHandle = SyncPoints.find(SyncPointWaitList[i]);
+        NodeHandle != SyncPoints.end()) {
+      CuNodesList.push_back(*NodeHandle->second.get());
+    } else {
+      return UR_RESULT_ERROR_INVALID_VALUE;
+    }
+  }
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
@@ -51,7 +87,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
 
   auto RetCommandBuffer = *hCommandBuffer;
   try {
-    UR_CHECK_ERROR(cuGraphCreate(&RetCommandBuffer->cudaGraph, 0));
+    UR_CHECK_ERROR(cuGraphCreate(&RetCommandBuffer->CudaGraph, 0));
   } catch (...) {
     return UR_RESULT_ERROR_OUT_OF_RESOURCES;
   }
@@ -77,8 +113,8 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
 UR_APIEXPORT ur_result_t UR_APICALL
 urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
   try {
-    UR_CHECK_ERROR(cuGraphInstantiate(&hCommandBuffer->cudaGraphExec,
-                                      hCommandBuffer->cudaGraph, 0));
+    UR_CHECK_ERROR(cuGraphInstantiate(&hCommandBuffer->CudaGraphExec,
+                                      hCommandBuffer->CudaGraph, 0));
   } catch (...) {
     return UR_RESULT_ERROR_UNKNOWN;
   }
@@ -92,19 +128,82 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
     uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *pSyncPoint) {
-  (void)hCommandBuffer;
-  (void)hKernel;
-  (void)workDim;
-  (void)pGlobalWorkOffset;
-  (void)pGlobalWorkSize;
-  (void)pLocalWorkSize;
-  (void)numSyncPointsInWaitList;
-  (void)pSyncPointWaitList;
-  (void)pSyncPoint;
+  // Preconditions
+  UR_ASSERT(hCommandBuffer->Context == hKernel->getContext(),
+            UR_RESULT_ERROR_INVALID_KERNEL);
+  UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+  UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  CUgraphNode GraphNode;
+
+  std::vector<CUgraphNode> DepsList;
+  UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
+                                 pSyncPointWaitList, DepsList));
+
+  if (*pGlobalWorkSize == 0) {
+    try {
+      // Create a empty node if the kernel worload size is zero
+      Result = UR_CHECK_ERROR(
+          cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
+                              DepsList.data(), DepsList.size()));
+
+      // Get sync point and register the event with it.
+      *pSyncPoint = hCommandBuffer->GetNextSyncPoint();
+      hCommandBuffer->RegisterSyncPoint(
+          *pSyncPoint, std::make_shared<CUgraphNode>(GraphNode));
+    } catch (ur_result_t Err) {
+      Result = Err;
+    }
+    return Result;
+  }
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  // Set the number of threads per block to the number of threads per warp
+  // by default unless user has provided a better number
+  size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+
+  uint32_t LocalSize = hKernel->getLocalSize();
+  CUfunction CuFunc = hKernel->get();
+
+  if ((Result = setKernelParams(
+           hCommandBuffer->Context, hCommandBuffer->Device, workDim,
+           pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc,
+           ThreadsPerBlock, BlocksPerGrid)) != UR_RESULT_SUCCESS) {
+    return Result;
+  }
+
+  try {
+    // Set node param structure with the kernel related data
+    auto &ArgIndices = hKernel->getArgIndices();
+    CUDA_KERNEL_NODE_PARAMS nodeParams;
+    nodeParams.func = CuFunc;
+    nodeParams.gridDimX = BlocksPerGrid[0];
+    nodeParams.gridDimY = BlocksPerGrid[1];
+    nodeParams.gridDimZ = BlocksPerGrid[2];
+    nodeParams.blockDimX = ThreadsPerBlock[0];
+    nodeParams.blockDimY = ThreadsPerBlock[1];
+    nodeParams.blockDimZ = ThreadsPerBlock[2];
+    nodeParams.sharedMemBytes = LocalSize;
+    nodeParams.kernelParams = const_cast<void **>(ArgIndices.data());
+    nodeParams.extra = nullptr;
+
+    // Create and add an new kernel node to the Cuda graph
+    Result = UR_CHECK_ERROR(
+        cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph,
+                             DepsList.data(), DepsList.size(), &nodeParams));
+
+    if (LocalSize != 0)
+      hKernel->clearLocalSize();
+
+    // Get sync point and register the event with it.
+    *pSyncPoint = hCommandBuffer->GetNextSyncPoint();
+    hCommandBuffer->RegisterSyncPoint(*pSyncPoint,
+                                      std::make_shared<CUgraphNode>(GraphNode));
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp(
@@ -275,13 +374,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
     ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
-  (void)hCommandBuffer;
-  (void)hQueue;
-  (void)numEventsInWaitList;
-  (void)phEventWaitList;
-  (void)phEvent;
+  ur_result_t Result = UR_RESULT_SUCCESS;
 
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  try {
+    std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+    ScopedContext Active(hQueue->getContext());
+    uint32_t StreamToken;
+    ur_stream_guard_ Guard;
+    CUstream CuStream = hQueue->getNextComputeStream(
+        numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
+
+    if ((Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+                                    phEventWaitList)) != UR_RESULT_SUCCESS) {
+      return Result;
+    }
+
+    if (phEvent) {
+      RetImplEvent =
+          std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
+              UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken));
+      RetImplEvent->start();
+    }
+
+    // Launch graph
+    Result =
+        UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream));
+
+    if (phEvent) {
+      Result = RetImplEvent->record();
+      *phEvent = RetImplEvent.release();
+    }
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
 }
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp
index 6049ceaed88a4..d2d5079005bc0 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp
@@ -9,7 +9,164 @@
 #include <ur/ur.hpp>
 #include <ur_api.h>
 
+#include "context.hpp"
 #include <cuda.h>
+#include <memory>
+
+static auto getUrResultString = [](ur_result_t Result) {
+  switch (Result) {
+  case UR_RESULT_SUCCESS:
+    return "UR_RESULT_SUCCESS";
+  case UR_RESULT_ERROR_INVALID_OPERATION:
+    return "UR_RESULT_ERROR_INVALID_OPERATION";
+  case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES:
+    return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES";
+  case UR_RESULT_ERROR_INVALID_QUEUE:
+    return "UR_RESULT_ERROR_INVALID_QUEUE";
+  case UR_RESULT_ERROR_INVALID_VALUE:
+    return "UR_RESULT_ERROR_INVALID_VALUE";
+  case UR_RESULT_ERROR_INVALID_CONTEXT:
+    return "UR_RESULT_ERROR_INVALID_CONTEXT";
+  case UR_RESULT_ERROR_INVALID_PLATFORM:
+    return "UR_RESULT_ERROR_INVALID_PLATFORM";
+  case UR_RESULT_ERROR_INVALID_BINARY:
+    return "UR_RESULT_ERROR_INVALID_BINARY";
+  case UR_RESULT_ERROR_INVALID_PROGRAM:
+    return "UR_RESULT_ERROR_INVALID_PROGRAM";
+  case UR_RESULT_ERROR_INVALID_SAMPLER:
+    return "UR_RESULT_ERROR_INVALID_SAMPLER";
+  case UR_RESULT_ERROR_INVALID_BUFFER_SIZE:
+    return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE";
+  case UR_RESULT_ERROR_INVALID_MEM_OBJECT:
+    return "UR_RESULT_ERROR_INVALID_MEM_OBJECT";
+  case UR_RESULT_ERROR_INVALID_EVENT:
+    return "UR_RESULT_ERROR_INVALID_EVENT";
+  case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST:
+    return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST";
+  case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET:
+    return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET";
+  case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE:
+    return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE";
+  case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE:
+    return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE";
+  case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE:
+    return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE";
+  case UR_RESULT_ERROR_DEVICE_NOT_FOUND:
+    return "UR_RESULT_ERROR_DEVICE_NOT_FOUND";
+  case UR_RESULT_ERROR_INVALID_DEVICE:
+    return "UR_RESULT_ERROR_INVALID_DEVICE";
+  case UR_RESULT_ERROR_DEVICE_LOST:
+    return "UR_RESULT_ERROR_DEVICE_LOST";
+  case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET:
+    return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET";
+  case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE:
+    return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE";
+  case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED:
+    return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED";
+  case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT:
+    return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT";
+  case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE:
+    return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE";
+  case UR_RESULT_ERROR_INVALID_WORK_DIMENSION:
+    return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION";
+  case UR_RESULT_ERROR_INVALID_KERNEL_ARGS:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS";
+  case UR_RESULT_ERROR_INVALID_KERNEL:
+    return "UR_RESULT_ERROR_INVALID_KERNEL";
+  case UR_RESULT_ERROR_INVALID_KERNEL_NAME:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_NAME";
+  case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX";
+  case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE";
+  case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE:
+    return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE";
+  case UR_RESULT_ERROR_INVALID_IMAGE_SIZE:
+    return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE";
+  case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+    return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR";
+  case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED:
+    return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED";
+  case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
+    return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE";
+  case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE:
+    return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE";
+  case UR_RESULT_ERROR_UNINITIALIZED:
+    return "UR_RESULT_ERROR_UNINITIALIZED";
+  case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY:
+    return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY";
+  case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY:
+    return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY";
+  case UR_RESULT_ERROR_OUT_OF_RESOURCES:
+    return "UR_RESULT_ERROR_OUT_OF_RESOURCES";
+  case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE:
+    return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE";
+  case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE:
+    return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE";
+  case UR_RESULT_ERROR_UNSUPPORTED_VERSION:
+    return "UR_RESULT_ERROR_UNSUPPORTED_VERSION";
+  case UR_RESULT_ERROR_UNSUPPORTED_FEATURE:
+    return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE";
+  case UR_RESULT_ERROR_INVALID_ARGUMENT:
+    return "UR_RESULT_ERROR_INVALID_ARGUMENT";
+  case UR_RESULT_ERROR_INVALID_NULL_HANDLE:
+    return "UR_RESULT_ERROR_INVALID_NULL_HANDLE";
+  case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE:
+    return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE";
+  case UR_RESULT_ERROR_INVALID_NULL_POINTER:
+    return "UR_RESULT_ERROR_INVALID_NULL_POINTER";
+  case UR_RESULT_ERROR_INVALID_SIZE:
+    return "UR_RESULT_ERROR_INVALID_SIZE";
+  case UR_RESULT_ERROR_UNSUPPORTED_SIZE:
+    return "UR_RESULT_ERROR_UNSUPPORTED_SIZE";
+  case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT:
+    return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT";
+  case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT:
+    return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT";
+  case UR_RESULT_ERROR_INVALID_ENUMERATION:
+    return "UR_RESULT_ERROR_INVALID_ENUMERATION";
+  case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION:
+    return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION";
+  case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
+    return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT";
+  case UR_RESULT_ERROR_INVALID_NATIVE_BINARY:
+    return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY";
+  case UR_RESULT_ERROR_INVALID_GLOBAL_NAME:
+    return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME";
+  case UR_RESULT_ERROR_INVALID_FUNCTION_NAME:
+    return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME";
+  case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION:
+    return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION";
+  case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION:
+    return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION";
+  case UR_RESULT_ERROR_PROGRAM_UNLINKED:
+    return "UR_RESULT_ERROR_PROGRAM_UNLINKED";
+  case UR_RESULT_ERROR_OVERLAPPING_REGIONS:
+    return "UR_RESULT_ERROR_OVERLAPPING_REGIONS";
+  case UR_RESULT_ERROR_INVALID_HOST_PTR:
+    return "UR_RESULT_ERROR_INVALID_HOST_PTR";
+  case UR_RESULT_ERROR_INVALID_USM_SIZE:
+    return "UR_RESULT_ERROR_INVALID_USM_SIZE";
+  case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE:
+    return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE";
+  case UR_RESULT_ERROR_ADAPTER_SPECIFIC:
+    return "UR_RESULT_ERROR_ADAPTER_SPECIFIC";
+  default:
+    return "UR_RESULT_ERROR_UNKNOWN";
+  }
+};
+
+// Trace an internal PI call; returns in case of an error.
+#define UR_CALL(Call)                                                          \
+  {                                                                            \
+    if (PrintTrace)                                                            \
+      fprintf(stderr, "UR ---> %s\n", #Call);                                  \
+    ur_result_t Result = (Call);                                               \
+    if (PrintTrace)                                                            \
+      fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result));   \
+    if (Result != UR_RESULT_SUCCESS)                                           \
+      return Result;                                                           \
+  }
 
 struct ur_exp_command_buffer_handle_t_ {
 
@@ -18,18 +175,36 @@ struct ur_exp_command_buffer_handle_t_ {
 
   ~ur_exp_command_buffer_handle_t_();
 
+  void RegisterSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint,
+                         std::shared_ptr<CUgraphNode> CuNode) {
+    SyncPoints[SyncPoint] = CuNode;
+    NextSyncPoint++;
+  }
+
+  ur_exp_command_buffer_sync_point_t GetNextSyncPoint() const {
+    return NextSyncPoint;
+  }
+
   // UR context associated with this command-buffer
   ur_context_handle_t Context;
   // Device associated with this command buffer
   ur_device_handle_t Device;
   // Cuda Graph handle
-  CUgraph cudaGraph;
+  CUgraph CudaGraph;
   // Cuda Graph Exec handle
-  CUgraphExec cudaGraphExec;
+  CUgraphExec CudaGraphExec;
   // Atomic variable counting the number of reference to this command_buffer
   // using std::atomic prevents data race when incrementing/decrementing.
   std::atomic_uint32_t RefCount;
 
+  // Map of sync_points to ur_events
+  std::unordered_map<ur_exp_command_buffer_sync_point_t,
+                     std::shared_ptr<CUgraphNode>>
+      SyncPoints;
+  // Next sync_point value (may need to consider ways to reuse values if 32-bits
+  // is not enough)
+  ur_exp_command_buffer_sync_point_t NextSyncPoint;
+
   // Used when retaining an object.
   uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
   // Used when releasing an object.
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
index f1a1f25c5b221..085da92c34370 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp
@@ -184,6 +184,136 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
   return BlockSize * Kernel->getRegsPerThread() > Device->getMaxRegsPerBlock();
 }
 
+// Helper to compute and put in good shape kernel parameters from workload
+// dimensions.
+// @param [in]  Context handler to the target Context
+// @param [in]  Device handler to the target Device
+// @param [in]  WorkDim workload dimension
+// @param [in]  GlobalWorkOffset pointer workload global offsets
+// @param [in]  LocalWorkOffset pointer workload local offsets
+// @param [inout] Kernel handler to the kernel
+// @param [inout] CuFunc handler to the cuda function attached to the kernel
+// @param [out] ThreadsPerBlock Number of threads per block we should run
+// @param [out] BlocksPerGrid Number of blocks per grid we should run
+ur_result_t
+setKernelParams(const ur_context_handle_t Context,
+                const ur_device_handle_t Device, const uint32_t WorkDim,
+                const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+                const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel,
+                CUfunction &CuFunc, size_t (&ThreadsPerBlock)[3],
+                size_t (&BlocksPerGrid)[3]) {
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  size_t MaxWorkGroupSize = 0u;
+  size_t MaxThreadsPerBlock[3] = {};
+  bool ProvidedLocalWorkGroupSize = (LocalWorkSize != nullptr);
+  uint32_t LocalSize = Kernel->getLocalSize();
+
+  try {
+    // Set the active context here as guessLocalWorkSize needs an active context
+    ScopedContext Active(Context);
+    {
+      size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
+      MaxWorkGroupSize = Device->getMaxWorkGroupSize();
+      Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock),
+                                  MaxThreadsPerBlock);
+
+      if (ProvidedLocalWorkGroupSize) {
+        auto IsValid = [&](int Dim) {
+          if (ReqdThreadsPerBlock[Dim] != 0 &&
+              LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim])
+            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+
+          if (LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim])
+            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+          // Checks that local work sizes are a divisor of the global work sizes
+          // which includes that the local work sizes are neither larger than
+          // the global work sizes and not 0.
+          if (0u == LocalWorkSize[Dim])
+            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+          if (0u != (GlobalWorkSize[Dim] % LocalWorkSize[Dim]))
+            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+          ThreadsPerBlock[Dim] = LocalWorkSize[Dim];
+          return UR_RESULT_SUCCESS;
+        };
+
+        size_t KernelLocalWorkGroupSize = 0;
+        for (size_t Dim = 0; Dim < WorkDim; Dim++) {
+          auto Err = IsValid(Dim);
+          if (Err != UR_RESULT_SUCCESS)
+            return Err;
+          // If no error then sum the total local work size per dim.
+          KernelLocalWorkGroupSize += LocalWorkSize[Dim];
+        }
+
+        if (hasExceededMaxRegistersPerBlock(Device, Kernel,
+                                            KernelLocalWorkGroupSize)) {
+          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+        }
+      } else {
+        guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
+                           MaxThreadsPerBlock, Kernel, LocalSize);
+      }
+    }
+
+    if (MaxWorkGroupSize <
+        ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) {
+      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+    }
+
+    for (size_t i = 0; i < WorkDim; i++) {
+      BlocksPerGrid[i] =
+          (GlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i];
+    }
+
+    // Set the implicit global offset parameter if kernel has offset variant
+    if (Kernel->get_with_offset_parameter()) {
+      std::uint32_t CudaImplicitOffset[3] = {0, 0, 0};
+      if (GlobalWorkOffset) {
+        for (size_t i = 0; i < WorkDim; i++) {
+          CudaImplicitOffset[i] =
+              static_cast<std::uint32_t>(GlobalWorkOffset[i]);
+          if (GlobalWorkOffset[i] != 0) {
+            CuFunc = Kernel->get_with_offset_parameter();
+          }
+        }
+      }
+      Kernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset),
+                                   CudaImplicitOffset);
+    }
+
+    if (Context->getDevice()->maxLocalMemSizeChosen()) {
+      // Set up local memory requirements for kernel.
+      auto Device = Context->getDevice();
+      if (Device->getMaxChosenLocalMem() < 0) {
+        setErrorMessage("Invalid value specified for "
+                        "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
+                        UR_RESULT_ERROR_ADAPTER_SPECIFIC);
+        return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+      }
+      if (LocalSize > static_cast<uint32_t>(Device->getMaxCapacityLocalMem())) {
+        setErrorMessage("Too much local memory allocated for device",
+                        UR_RESULT_ERROR_ADAPTER_SPECIFIC);
+        return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+      }
+      if (LocalSize > static_cast<uint32_t>(Device->getMaxChosenLocalMem())) {
+        setErrorMessage(
+            "Local memory for kernel exceeds the amount requested using "
+            "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value for "
+            "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.",
+            UR_RESULT_ERROR_ADAPTER_SPECIFIC);
+        return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
+      }
+      UR_CHECK_ERROR(cuFuncSetAttribute(
+          CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+          Device->getMaxChosenLocalMem()));
+    }
+
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
+}
+
 /// Enqueues a wait on the given CUstream for all specified events (See
 /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued
 /// wait will wait on all previous events in the queue.
@@ -292,100 +422,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   // Set the number of threads per block to the number of threads per warp
   // by default unless user has provided a better number
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
-  size_t MaxWorkGroupSize = 0u;
-  size_t MaxThreadsPerBlock[3] = {};
-  bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr);
+  size_t BlocksPerGrid[3] = {1u, 1u, 1u};
+
   uint32_t LocalSize = hKernel->getLocalSize();
   ur_result_t Result = UR_RESULT_SUCCESS;
+  CUfunction CuFunc = hKernel->get();
 
-  try {
-    // Set the active context here as guessLocalWorkSize needs an active context
-    ScopedContext Active(hQueue->getContext());
-    {
-      size_t *ReqdThreadsPerBlock = hKernel->ReqdThreadsPerBlock;
-      MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize();
-      hQueue->Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock),
-                                          MaxThreadsPerBlock);
-
-      if (ProvidedLocalWorkGroupSize) {
-        auto IsValid = [&](int Dim) {
-          if (ReqdThreadsPerBlock[Dim] != 0 &&
-              pLocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim])
-            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-
-          if (pLocalWorkSize[Dim] > MaxThreadsPerBlock[Dim])
-            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-          // Checks that local work sizes are a divisor of the global work sizes
-          // which includes that the local work sizes are neither larger than
-          // the global work sizes and not 0.
-          if (0u == pLocalWorkSize[Dim])
-            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-          if (0u != (pGlobalWorkSize[Dim] % pLocalWorkSize[Dim]))
-            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-          ThreadsPerBlock[Dim] = pLocalWorkSize[Dim];
-          return UR_RESULT_SUCCESS;
-        };
-
-        size_t KernelLocalWorkGroupSize = 0;
-        for (size_t Dim = 0; Dim < workDim; Dim++) {
-          auto Err = IsValid(Dim);
-          if (Err != UR_RESULT_SUCCESS)
-            return Err;
-          // If no error then sum the total local work size per dim.
-          KernelLocalWorkGroupSize += pLocalWorkSize[Dim];
-        }
-
-        if (hasExceededMaxRegistersPerBlock(hQueue->Device, hKernel,
-                                            KernelLocalWorkGroupSize)) {
-          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-        }
-      } else {
-        guessLocalWorkSize(hQueue->Device, ThreadsPerBlock, pGlobalWorkSize,
-                           workDim, MaxThreadsPerBlock, hKernel, LocalSize);
-      }
-    }
-
-    if (MaxWorkGroupSize <
-        ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) {
-      return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
-    }
-
-    size_t BlocksPerGrid[3] = {1u, 1u, 1u};
-
-    for (size_t i = 0; i < workDim; i++) {
-      BlocksPerGrid[i] =
-          (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i];
-    }
+  if ((Result = setKernelParams(
+           hQueue->getContext(), hQueue->Device, workDim, pGlobalWorkOffset,
+           pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc, ThreadsPerBlock,
+           BlocksPerGrid)) != UR_RESULT_SUCCESS) {
+    return Result;
+  }
 
+  try {
     std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
 
     uint32_t StreamToken;
     ur_stream_guard_ Guard;
     CUstream CuStream = hQueue->getNextComputeStream(
         numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
-    CUfunction CuFunc = hKernel->get();
 
     Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
                                phEventWaitList);
 
-    // Set the implicit global offset parameter if kernel has offset variant
-    if (hKernel->get_with_offset_parameter()) {
-      std::uint32_t CudaImplicitOffset[3] = {0, 0, 0};
-      if (pGlobalWorkOffset) {
-        for (size_t i = 0; i < workDim; i++) {
-          CudaImplicitOffset[i] =
-              static_cast<std::uint32_t>(pGlobalWorkOffset[i]);
-          if (pGlobalWorkOffset[i] != 0) {
-            CuFunc = hKernel->get_with_offset_parameter();
-          }
-        }
-      }
-      hKernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset),
-                                    CudaImplicitOffset);
-    }
-
-    auto &ArgIndices = hKernel->getArgIndices();
-
     if (phEvent) {
       RetImplEvent =
           std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
@@ -393,37 +453,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       RetImplEvent->start();
     }
 
-    if (hQueue->getContext()->getDevice()->maxLocalMemSizeChosen()) {
-      // Set up local memory requirements for kernel.
-      auto Device = hQueue->getContext()->getDevice();
-      if (Device->getMaxChosenLocalMem() < 0) {
-        setErrorMessage("Invalid value specified for "
-                        "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE",
-                        UR_RESULT_ERROR_ADAPTER_SPECIFIC);
-        return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
-      }
-      if (LocalSize > static_cast<uint32_t>(Device->getMaxCapacityLocalMem())) {
-        setErrorMessage("Too much local memory allocated for device",
-                        UR_RESULT_ERROR_ADAPTER_SPECIFIC);
-        return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
-      }
-      if (LocalSize > static_cast<uint32_t>(Device->getMaxChosenLocalMem())) {
-        setErrorMessage(
-            "Local memory for kernel exceeds the amount requested using "
-            "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value for "
-            "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.",
-            UR_RESULT_ERROR_ADAPTER_SPECIFIC);
-        return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
-      }
-      UR_CHECK_ERROR(cuFuncSetAttribute(
-          CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-          Device->getMaxChosenLocalMem()));
-    }
-
+    auto &ArgIndices = hKernel->getArgIndices();
     Result = UR_CHECK_ERROR(cuLaunchKernel(
         CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
         ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize,
         CuStream, const_cast<void **>(ArgIndices.data()), nullptr));
+
     if (LocalSize != 0)
       hKernel->clearLocalSize();
 
@@ -431,6 +466,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       Result = RetImplEvent->record();
       *phEvent = RetImplEvent.release();
     }
+
   } catch (ur_result_t Err) {
     Result = Err;
   }
diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp
index 393085ce42eb2..b6a458eb53ac7 100644
--- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp
+++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp
@@ -14,3 +14,20 @@
 ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
                               uint32_t NumEventsInWaitList,
                               const ur_event_handle_t *EventWaitList);
+
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        const size_t MaxThreadsPerBlock[3],
+                        ur_kernel_handle_t Kernel, uint32_t LocalSize);
+
+bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
+                                     ur_kernel_handle_t Kernel,
+                                     size_t BlockSize);
+
+ur_result_t
+setKernelParams(const ur_context_handle_t Context,
+                const ur_device_handle_t Device, const uint32_t WorkDim,
+                const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
+                const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel,
+                CUfunction &CuFunc, size_t (&ThreadsPerBlock)[3],
+                size_t (&BlocksPerGrid)[3]);
diff --git a/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp
index 44901e3fb452c..6f3c5acbf5873 100644
--- a/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp
+++ b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
index 8ad6a413aea03..d41484a285602 100644
--- a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
+++ b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp b/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp
index 6191a875bbe41..b81a916e0fb5d 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
index c7adb7f282da4..b6d765c60090e 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
index 79e53ff4ba9d9..ac4564ce5797f 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
index fa5a2a1f018e6..17a2827372667 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
index 1b7447940e1fe..39843f07ff876 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp
index 26e5473bded66..2a8c69c2afca8 100644
--- a/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp
+++ b/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp
index 2c2edd374febc..f32ca3d9d5067 100644
--- a/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp
+++ b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/depends_on.cpp b/sycl/test-e2e/Graph/Explicit/depends_on.cpp
index 6454a0eeeaa0e..73a8a77361b55 100644
--- a/sycl/test-e2e/Graph/Explicit/depends_on.cpp
+++ b/sycl/test-e2e/Graph/Explicit/depends_on.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/dotp.cpp b/sycl/test-e2e/Graph/Explicit/dotp.cpp
index 37c8f1341343a..f976fbcc5c3ba 100644
--- a/sycl/test-e2e/Graph/Explicit/dotp.cpp
+++ b/sycl/test-e2e/Graph/Explicit/dotp.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp b/sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp
index 4a2d46c611b70..bd6bf65730c55 100644
--- a/sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp
+++ b/sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/dotp_host_mem.cpp b/sycl/test-e2e/Graph/Explicit/dotp_host_mem.cpp
index 95beeefc8df86..1f252b4338f74 100644
--- a/sycl/test-e2e/Graph/Explicit/dotp_host_mem.cpp
+++ b/sycl/test-e2e/Graph/Explicit/dotp_host_mem.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/dotp_host_shared.cpp b/sycl/test-e2e/Graph/Explicit/dotp_host_shared.cpp
index 2ebbd07d2c14c..88b473e7e5c4c 100644
--- a/sycl/test-e2e/Graph/Explicit/dotp_host_shared.cpp
+++ b/sycl/test-e2e/Graph/Explicit/dotp_host_shared.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp b/sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp
index 590b5cee19919..a0449d6406d3e 100644
--- a/sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp
+++ b/sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/dotp_shared_mem.cpp b/sycl/test-e2e/Graph/Explicit/dotp_shared_mem.cpp
index 6fd1d79b8ba01..af0d5b916222f 100644
--- a/sycl/test-e2e/Graph/Explicit/dotp_shared_mem.cpp
+++ b/sycl/test-e2e/Graph/Explicit/dotp_shared_mem.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/dotp_system_mem.cpp b/sycl/test-e2e/Graph/Explicit/dotp_system_mem.cpp
index e53598a9bca4d..c9c91268cbbb5 100644
--- a/sycl/test-e2e/Graph/Explicit/dotp_system_mem.cpp
+++ b/sycl/test-e2e/Graph/Explicit/dotp_system_mem.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/empty.cpp b/sycl/test-e2e/Graph/Explicit/empty.cpp
index 04cec170a9010..548e0aefbec01 100644
--- a/sycl/test-e2e/Graph/Explicit/empty.cpp
+++ b/sycl/test-e2e/Graph/Explicit/empty.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/empty_node.cpp b/sycl/test-e2e/Graph/Explicit/empty_node.cpp
index 687a25b923d78..638ccbde77e18 100644
--- a/sycl/test-e2e/Graph/Explicit/empty_node.cpp
+++ b/sycl/test-e2e/Graph/Explicit/empty_node.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/empty_with_deps.cpp b/sycl/test-e2e/Graph/Explicit/empty_with_deps.cpp
index 799e202422c51..803cf0ec32c19 100644
--- a/sycl/test-e2e/Graph/Explicit/empty_with_deps.cpp
+++ b/sycl/test-e2e/Graph/Explicit/empty_with_deps.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp
index 6948090873181..559646103f557 100644
--- a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp
+++ b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
index a414e3f4b8d6c..9de3fd735eee5 100644
--- a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
+++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/node_ordering.cpp b/sycl/test-e2e/Graph/Explicit/node_ordering.cpp
index 233013ba5cf8f..2fa23ac5559ab 100644
--- a/sycl/test-e2e/Graph/Explicit/node_ordering.cpp
+++ b/sycl/test-e2e/Graph/Explicit/node_ordering.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
index 305831a6abb9b..495a5c3e7b2f7 100644
--- a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
+++ b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/saxpy.cpp b/sycl/test-e2e/Graph/Explicit/saxpy.cpp
index 8c8ef7fad2c0e..e6214fcddf064 100644
--- a/sycl/test-e2e/Graph/Explicit/saxpy.cpp
+++ b/sycl/test-e2e/Graph/Explicit/saxpy.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Explicit/single_node.cpp b/sycl/test-e2e/Graph/Explicit/single_node.cpp
index 766f6f08de281..3ef60d11d9e82 100644
--- a/sycl/test-e2e/Graph/Explicit/single_node.cpp
+++ b/sycl/test-e2e/Graph/Explicit/single_node.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
index 154ea4e3470e3..ff3a8c1b2eea9 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
index edce73a46ad73..ad0badd14ed98 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
index b8863b57c7290..bc66115d8fa35 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
index fe906bb7aba14..a1dc0e479da73 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
index 4254a861fe344..7847981a86577 100644
--- a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
+++ b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp
index d5b3ff7412b61..c8fd1d803266c 100644
--- a/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp
+++ b/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Explicit/while_recording.cpp b/sycl/test-e2e/Graph/Explicit/while_recording.cpp
index b2cef162c7f06..910f6aa31c1f4 100644
--- a/sycl/test-e2e/Graph/Explicit/while_recording.cpp
+++ b/sycl/test-e2e/Graph/Explicit/while_recording.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
index be0bcef2c8934..711a7c0838a1b 100644
--- a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp
index 7d0c7c81d780f..70052a1a2220b 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
index a6c098b472847..988e22b75d049 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
index c3492b6d26722..d60dc0f454242 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
index b4b7f26ceebbf..4645f079cd004 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
index c3a140d64eae4..51747fa00f7a6 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp
index e731b586885ac..360cb915f757e 100644
--- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp
index 9910cc82d6e6b..beda0d2b33ee8 100644
--- a/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp b/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp
index a25b0ff4c5a11..bf9eb1cf4ba90 100644
--- a/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp.cpp
index 5e3403d688f60..9da5eabb01c03 100644
--- a/sycl/test-e2e/Graph/RecordReplay/dotp.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_buffer.cpp
index 030c5bc857050..8d911b2e621d6 100644
--- a/sycl/test-e2e/Graph/RecordReplay/dotp_buffer.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_buffer.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_host_mem.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_host_mem.cpp
index dca1d85cdf76e..c6f59da7ed8e0 100644
--- a/sycl/test-e2e/Graph/RecordReplay/dotp_host_mem.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_host_mem.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_host_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_host_shared.cpp
index 2171c6db87e82..cb584495e978b 100644
--- a/sycl/test-e2e/Graph/RecordReplay/dotp_host_shared.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_host_shared.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp
index 756fccdc99611..31aae2ae6dc27 100644
--- a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp
index 2e41e5a85f5b9..935fa7b4ad297 100644
--- a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp
index b0d988ca6deda..18be077ddb6e8 100644
--- a/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_shared_mem.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_shared_mem.cpp
index a791311435a89..9564bcdee5c8b 100644
--- a/sycl/test-e2e/Graph/RecordReplay/dotp_shared_mem.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_shared_mem.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_system_mem.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_system_mem.cpp
index 4e8465bac3b0b..f0224c5b02995 100644
--- a/sycl/test-e2e/Graph/RecordReplay/dotp_system_mem.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/dotp_system_mem.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/empty.cpp b/sycl/test-e2e/Graph/RecordReplay/empty.cpp
index 74c96aa407da9..2e3d961b84550 100644
--- a/sycl/test-e2e/Graph/RecordReplay/empty.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/empty.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
index 967cfbaaf58e8..95cf1baa62835 100644
--- a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node_with_dep.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node_with_dep.cpp
index c247d61796df6..73aa9a9b62b28 100644
--- a/sycl/test-e2e/Graph/RecordReplay/empty_node_with_dep.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/empty_node_with_dep.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
index 8a59f12d316b4..1a6754619f7d0 100644
--- a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
index 3a702d025b3d3..4e5f0d35dbdd5 100644
--- a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/saxpy.cpp b/sycl/test-e2e/Graph/RecordReplay/saxpy.cpp
index ddcc163672d2d..67cf264a102ad 100644
--- a/sycl/test-e2e/Graph/RecordReplay/saxpy.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/saxpy.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/simple_shared_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/simple_shared_usm.cpp
index 6719fdcb35160..98625b801b221 100644
--- a/sycl/test-e2e/Graph/RecordReplay/simple_shared_usm.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/simple_shared_usm.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
index 2515d95fac5f2..8ce937c7d7195 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
index 4921e51cc98aa..afa6ec38574a4 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp
index 0f21e3b6fefd9..0eb2279f1ef1d 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
index aedd9e252e252..767cf279b7c06 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
index 6fc2b39efade3..75865f6f08697 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
index 6eb49c39583da..382d1ab67fb18 100644
--- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp
index a51bcc967b2ee..1a124ae49d3b3 100644
--- a/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp
index 6cc286bc2c96b..333685c2a49ea 100644
--- a/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp
+++ b/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // Extra run to check for leaks in Level Zero using ZE_DEBUG
diff --git a/sycl/test-e2e/Graph/Threading/finalize.cpp b/sycl/test-e2e/Graph/Threading/finalize.cpp
index e6e3337f2bd87..995b60b33edcb 100644
--- a/sycl/test-e2e/Graph/Threading/finalize.cpp
+++ b/sycl/test-e2e/Graph/Threading/finalize.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build_pthread_inc} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Threading/queue_state.cpp b/sycl/test-e2e/Graph/Threading/queue_state.cpp
index b1713d2e9b1dd..e38976a7fc3e8 100644
--- a/sycl/test-e2e/Graph/Threading/queue_state.cpp
+++ b/sycl/test-e2e/Graph/Threading/queue_state.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu, TEMPORARY_DISABLED
+// REQUIRES: cuda || level_zero, gpu, TEMPORARY_DISABLED
 // Disabled as thread safety not yet implemented
 
 // RUN: %clangxx -pthread -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
diff --git a/sycl/test-e2e/Graph/Threading/submit.cpp b/sycl/test-e2e/Graph/Threading/submit.cpp
index 193ce76687bdf..5deace7cb8173 100644
--- a/sycl/test-e2e/Graph/Threading/submit.cpp
+++ b/sycl/test-e2e/Graph/Threading/submit.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build_pthread_inc} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %}
diff --git a/sycl/test-e2e/Graph/Threading/update.cpp b/sycl/test-e2e/Graph/Threading/update.cpp
index 6e9b7bcbdb3bb..f65fb8bcc915e 100644
--- a/sycl/test-e2e/Graph/Threading/update.cpp
+++ b/sycl/test-e2e/Graph/Threading/update.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu, TEMPORARY_DISABLED
+// REQUIRES: cuda || level_zero, gpu, TEMPORARY_DISABLED
 // Disabled as Update feature is not yet implemented
 
 // RUN: %clangxx -pthread -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
diff --git a/sycl/test-e2e/Graph/cuda_backend_create_and_finalize_empty_graph.cpp b/sycl/test-e2e/Graph/cuda_backend_create_and_finalize_empty_graph.cpp
deleted file mode 100644
index 585d3841d5ddb..0000000000000
--- a/sycl/test-e2e/Graph/cuda_backend_create_and_finalize_empty_graph.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-// REQUIRES: cuda, gpu
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// Tests the ability to finalize a empty command graph
-// without submitting the graph.
-
-#include "graph_common.hpp"
-
-int GetCudaBackend(const sycl::device &Dev) {
-  // Return 1 if the device backend is "cuda" or 0 else.
-  // 0 does not prevent another device to be picked as a second choice
-  return Dev.get_backend() == backend::ext_oneapi_cuda;
-}
-
-int main() {
-  sycl::device CudaDev{GetCudaBackend};
-  queue Queue{CudaDev};
-
-  // Skip the test if no cuda backend found
-  if (CudaDev.get_backend() != backend::ext_oneapi_cuda)
-    return 0;
-
-  std::error_code ErrorCode = make_error_code(sycl::errc::success);
-  // This should not throw an exception
-  try {
-    exp_ext::command_graph Graph{Queue.get_context(), CudaDev};
-    auto GraphExec = Graph.finalize();
-  } catch (const sycl::exception &e) {
-    ErrorCode = e.code();
-  }
-  assert(ErrorCode == sycl::errc::success);
-
-  return 0;
-}
diff --git a/sycl/test-e2e/Graph/empty_graph.cpp b/sycl/test-e2e/Graph/empty_graph.cpp
index 90eb43986275a..e9937cd230cc2 100644
--- a/sycl/test-e2e/Graph/empty_graph.cpp
+++ b/sycl/test-e2e/Graph/empty_graph.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/Graph/finalize_empty.cpp b/sycl/test-e2e/Graph/finalize_empty.cpp
index dc29455ac2e81..1b1b0f8cee473 100644
--- a/sycl/test-e2e/Graph/finalize_empty.cpp
+++ b/sycl/test-e2e/Graph/finalize_empty.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/Graph/finalize_twice.cpp b/sycl/test-e2e/Graph/finalize_twice.cpp
index 31c178e8eeae5..752a63976f4d3 100644
--- a/sycl/test-e2e/Graph/finalize_twice.cpp
+++ b/sycl/test-e2e/Graph/finalize_twice.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: level_zero, gpu
+// REQUIRES: cuda || level_zero, gpu
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out