From c46ec835e525a43a9f60813f2afd0f4a2ad4a844 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Thu, 24 Aug 2023 11:03:15 +0100 Subject: [PATCH] [SYCL][Graph] Implement graph enqueue for CUDA backend (#305) * [SYCL][Graph] Implement graph enqueue for CUDA backend Implements urCommandBufferAppendKernelLaunchExp function to append kernel to a command buffer. Implements urCommandBufferAppendKernelLaunchExp function to enqueue a command buffer to a cuda stream. Enable all tests supported by the current state of the Cuda backend. * [SYCL][Graph] Refactors the code using helper to set Kernel params * [SYCL][Graph] Adds exception catching * [SYCL][Graph] Reformats paramaters of internal functions in PascalCase * [SYCL][Graph] Fixes command buffer enqueuing issue in multithreaded environment. Sets a ScopedContext before enqueuing command buffer. * [SYCL][Graph] Adds missing exception catch --- .../ur/adapters/cuda/command_buffer.cpp | 180 +++++++++++-- .../ur/adapters/cuda/command_buffer.hpp | 179 ++++++++++++- .../ur/adapters/cuda/enqueue.cpp | 250 ++++++++++-------- .../ur/adapters/cuda/enqueue.hpp | 17 ++ .../Explicit/add_node_while_recording.cpp | 2 +- .../Explicit/add_nodes_after_finalize.cpp | 2 +- sycl/test-e2e/Graph/Explicit/basic_buffer.cpp | 2 +- sycl/test-e2e/Graph/Explicit/basic_usm.cpp | 2 +- .../Graph/Explicit/basic_usm_host.cpp | 2 +- .../Graph/Explicit/basic_usm_mixed.cpp | 2 +- .../Graph/Explicit/basic_usm_shared.cpp | 2 +- .../Graph/Explicit/basic_usm_system.cpp | 2 +- .../Graph/Explicit/buffer_ordering.cpp | 2 +- sycl/test-e2e/Graph/Explicit/depends_on.cpp | 2 +- sycl/test-e2e/Graph/Explicit/dotp.cpp | 2 +- sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp | 2 +- .../test-e2e/Graph/Explicit/dotp_host_mem.cpp | 2 +- .../Graph/Explicit/dotp_host_shared.cpp | 2 +- sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp | 2 +- .../Graph/Explicit/dotp_shared_mem.cpp | 2 +- .../Graph/Explicit/dotp_system_mem.cpp | 2 +- sycl/test-e2e/Graph/Explicit/empty.cpp | 2 +- sycl/test-e2e/Graph/Explicit/empty_node.cpp | 2 +- .../Graph/Explicit/empty_with_deps.cpp | 2 +- .../Graph/Explicit/enqueue_ordering.cpp | 2 +- .../Graph/Explicit/multiple_exec_graphs.cpp | 2 +- .../test-e2e/Graph/Explicit/node_ordering.cpp | 2 +- .../test-e2e/Graph/Explicit/repeated_exec.cpp | 2 +- sycl/test-e2e/Graph/Explicit/saxpy.cpp | 2 +- sycl/test-e2e/Graph/Explicit/single_node.cpp | 2 +- sycl/test-e2e/Graph/Explicit/sub_graph.cpp | 2 +- .../sub_graph_execute_without_parent.cpp | 2 +- .../sub_graph_multiple_submission.cpp | 2 +- .../Graph/Explicit/sub_graph_nested.cpp | 2 +- .../Explicit/sub_graph_two_parent_graphs.cpp | 2 +- .../Explicit/temp_buffer_reinterpret.cpp | 2 +- .../Graph/Explicit/while_recording.cpp | 2 +- .../RecordReplay/add_nodes_after_finalize.cpp | 2 +- .../Graph/RecordReplay/basic_buffer.cpp | 2 +- .../test-e2e/Graph/RecordReplay/basic_usm.cpp | 2 +- .../Graph/RecordReplay/basic_usm_host.cpp | 2 +- .../Graph/RecordReplay/basic_usm_mixed.cpp | 2 +- .../Graph/RecordReplay/basic_usm_shared.cpp | 2 +- .../Graph/RecordReplay/basic_usm_system.cpp | 2 +- .../Graph/RecordReplay/buffer_ordering.cpp | 2 +- .../Graph/RecordReplay/concurrent_queue.cpp | 2 +- sycl/test-e2e/Graph/RecordReplay/dotp.cpp | 2 +- .../Graph/RecordReplay/dotp_buffer.cpp | 2 +- .../Graph/RecordReplay/dotp_host_mem.cpp | 2 +- .../Graph/RecordReplay/dotp_host_shared.cpp | 2 +- .../Graph/RecordReplay/dotp_in_order.cpp | 2 +- .../dotp_in_order_with_empty_nodes.cpp | 2 +- .../RecordReplay/dotp_multiple_queues.cpp | 2 +- .../Graph/RecordReplay/dotp_shared_mem.cpp | 2 +- .../Graph/RecordReplay/dotp_system_mem.cpp | 2 +- sycl/test-e2e/Graph/RecordReplay/empty.cpp | 2 +- .../Graph/RecordReplay/empty_node.cpp | 2 +- .../RecordReplay/empty_node_with_dep.cpp | 2 +- .../RecordReplay/multiple_exec_graphs.cpp | 2 +- .../Graph/RecordReplay/repeated_exec.cpp | 2 +- sycl/test-e2e/Graph/RecordReplay/saxpy.cpp | 2 +- .../Graph/RecordReplay/simple_shared_usm.cpp | 2 +- .../test-e2e/Graph/RecordReplay/sub_graph.cpp | 2 +- .../sub_graph_execute_without_parent.cpp | 2 +- .../Graph/RecordReplay/sub_graph_in_order.cpp | 2 +- .../sub_graph_multiple_submission.cpp | 2 +- .../Graph/RecordReplay/sub_graph_nested.cpp | 2 +- .../sub_graph_two_parent_graphs.cpp | 2 +- .../RecordReplay/temp_buffer_reinterpret.cpp | 2 +- .../Graph/RecordReplay/temp_scope.cpp | 2 +- sycl/test-e2e/Graph/Threading/finalize.cpp | 2 +- sycl/test-e2e/Graph/Threading/queue_state.cpp | 2 +- sycl/test-e2e/Graph/Threading/submit.cpp | 2 +- sycl/test-e2e/Graph/Threading/update.cpp | 2 +- ...ackend_create_and_finalize_empty_graph.cpp | 35 --- sycl/test-e2e/Graph/empty_graph.cpp | 2 +- sycl/test-e2e/Graph/finalize_empty.cpp | 2 +- sycl/test-e2e/Graph/finalize_twice.cpp | 2 +- 78 files changed, 562 insertions(+), 245 deletions(-) delete mode 100644 sycl/test-e2e/Graph/cuda_backend_create_and_finalize_empty_graph.cpp diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp index 66d193198a3c4..32b7a8b78b224 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.cpp @@ -8,13 +8,16 @@ #include "command_buffer.hpp" #include "common.hpp" - -/// Stub implementations of UR experimental feature command-buffers +#include "enqueue.hpp" +#include "event.hpp" +#include "kernel.hpp" +#include "memory.hpp" +#include "queue.hpp" ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t hContext, ur_device_handle_t hDevice) : Context(hContext), - Device(hDevice), cudaGraph{nullptr}, cudaGraphExec{nullptr}, RefCount{1} { + Device(hDevice), CudaGraph{nullptr}, CudaGraphExec{nullptr}, RefCount{1} { urContextRetain(hContext); urDeviceRetain(hDevice); } @@ -29,10 +32,43 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { urDeviceRelease(Device); // Release the memory allocated to the CudaGraph - cuGraphDestroy(cudaGraph); + cuGraphDestroy(CudaGraph); // Release the memory allocated to the CudaGraphExec - cuGraphExecDestroy(cudaGraphExec); + cuGraphExecDestroy(CudaGraphExec); +} + +/// Helper function for finding the Cuda Nodes associated with the +/// commands in a command-buffer, each event is pointed to by a sync-point in +/// the wait list. +/// +/// @param[in] CommandBuffer to lookup the events from. +/// @param[in] NumSyncPointsInWaitList Length of \p SyncPointWaitList. +/// @param[in] SyncPointWaitList List of sync points in \p CommandBuffer +/// to find the events for. +/// @param[out] CuNodesList Return parameter for the Cuda Nodes associated with +/// each sync-point in \p SyncPointWaitList. +/// +/// @return UR_RESULT_SUCCESS or an error code on failure +static ur_result_t getNodesFromSyncPoints( + const ur_exp_command_buffer_handle_t &CommandBuffer, + size_t NumSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, + std::vector &CuNodesList) { + // Map of ur_exp_command_buffer_sync_point_t to ur_event_handle_t defining + // the event associated with each sync-point + auto SyncPoints = CommandBuffer->SyncPoints; + + // For each sync-point add associated L0 event to the return list. + for (size_t i = 0; i < NumSyncPointsInWaitList; i++) { + if (auto NodeHandle = SyncPoints.find(SyncPointWaitList[i]); + NodeHandle != SyncPoints.end()) { + CuNodesList.push_back(*NodeHandle->second.get()); + } else { + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( @@ -51,7 +87,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( auto RetCommandBuffer = *hCommandBuffer; try { - UR_CHECK_ERROR(cuGraphCreate(&RetCommandBuffer->cudaGraph, 0)); + UR_CHECK_ERROR(cuGraphCreate(&RetCommandBuffer->CudaGraph, 0)); } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; } @@ -77,8 +113,8 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t hCommandBuffer) { try { - UR_CHECK_ERROR(cuGraphInstantiate(&hCommandBuffer->cudaGraphExec, - hCommandBuffer->cudaGraph, 0)); + UR_CHECK_ERROR(cuGraphInstantiate(&hCommandBuffer->CudaGraphExec, + hCommandBuffer->CudaGraph, 0)); } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } @@ -92,19 +128,82 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - (void)hCommandBuffer; - (void)hKernel; - (void)workDim; - (void)pGlobalWorkOffset; - (void)pGlobalWorkSize; - (void)pLocalWorkSize; - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; + // Preconditions + UR_ASSERT(hCommandBuffer->Context == hKernel->getContext(), + UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + ur_result_t Result = UR_RESULT_SUCCESS; + CUgraphNode GraphNode; + + std::vector DepsList; + UR_CALL(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList, + pSyncPointWaitList, DepsList)); + + if (*pGlobalWorkSize == 0) { + try { + // Create a empty node if the kernel worload size is zero + Result = UR_CHECK_ERROR( + cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph, + DepsList.data(), DepsList.size())); + + // Get sync point and register the event with it. + *pSyncPoint = hCommandBuffer->GetNextSyncPoint(); + hCommandBuffer->RegisterSyncPoint( + *pSyncPoint, std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; + } - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + // Set the number of threads per block to the number of threads per warp + // by default unless user has provided a better number + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + + uint32_t LocalSize = hKernel->getLocalSize(); + CUfunction CuFunc = hKernel->get(); + + if ((Result = setKernelParams( + hCommandBuffer->Context, hCommandBuffer->Device, workDim, + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc, + ThreadsPerBlock, BlocksPerGrid)) != UR_RESULT_SUCCESS) { + return Result; + } + + try { + // Set node param structure with the kernel related data + auto &ArgIndices = hKernel->getArgIndices(); + CUDA_KERNEL_NODE_PARAMS nodeParams; + nodeParams.func = CuFunc; + nodeParams.gridDimX = BlocksPerGrid[0]; + nodeParams.gridDimY = BlocksPerGrid[1]; + nodeParams.gridDimZ = BlocksPerGrid[2]; + nodeParams.blockDimX = ThreadsPerBlock[0]; + nodeParams.blockDimY = ThreadsPerBlock[1]; + nodeParams.blockDimZ = ThreadsPerBlock[2]; + nodeParams.sharedMemBytes = LocalSize; + nodeParams.kernelParams = const_cast(ArgIndices.data()); + nodeParams.extra = nullptr; + + // Create and add an new kernel node to the Cuda graph + Result = UR_CHECK_ERROR( + cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph, + DepsList.data(), DepsList.size(), &nodeParams)); + + if (LocalSize != 0) + hKernel->clearLocalSize(); + + // Get sync point and register the event with it. + *pSyncPoint = hCommandBuffer->GetNextSyncPoint(); + hCommandBuffer->RegisterSyncPoint(*pSyncPoint, + std::make_shared(GraphNode)); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemcpyUSMExp( @@ -275,13 +374,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - (void)hCommandBuffer; - (void)hQueue; - (void)numEventsInWaitList; - (void)phEventWaitList; - (void)phEvent; + ur_result_t Result = UR_RESULT_SUCCESS; - detail::ur::die("Experimental Command-buffer feature is not " - "implemented for CUDA adapter."); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + try { + std::unique_ptr RetImplEvent{nullptr}; + ScopedContext Active(hQueue->getContext()); + uint32_t StreamToken; + ur_stream_guard_ Guard; + CUstream CuStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + + if ((Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)) != UR_RESULT_SUCCESS) { + return Result; + } + + if (phEvent) { + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_KERNEL_LAUNCH, hQueue, CuStream, StreamToken)); + RetImplEvent->start(); + } + + // Launch graph + Result = + UR_CHECK_ERROR(cuGraphLaunch(hCommandBuffer->CudaGraphExec, CuStream)); + + if (phEvent) { + Result = RetImplEvent->record(); + *phEvent = RetImplEvent.release(); + } + } catch (ur_result_t Err) { + Result = Err; + } + return Result; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp index 6049ceaed88a4..d2d5079005bc0 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/command_buffer.hpp @@ -9,7 +9,164 @@ #include #include +#include "context.hpp" #include +#include + +static auto getUrResultString = [](ur_result_t Result) { + switch (Result) { + case UR_RESULT_SUCCESS: + return "UR_RESULT_SUCCESS"; + case UR_RESULT_ERROR_INVALID_OPERATION: + return "UR_RESULT_ERROR_INVALID_OPERATION"; + case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES: + return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES"; + case UR_RESULT_ERROR_INVALID_QUEUE: + return "UR_RESULT_ERROR_INVALID_QUEUE"; + case UR_RESULT_ERROR_INVALID_VALUE: + return "UR_RESULT_ERROR_INVALID_VALUE"; + case UR_RESULT_ERROR_INVALID_CONTEXT: + return "UR_RESULT_ERROR_INVALID_CONTEXT"; + case UR_RESULT_ERROR_INVALID_PLATFORM: + return "UR_RESULT_ERROR_INVALID_PLATFORM"; + case UR_RESULT_ERROR_INVALID_BINARY: + return "UR_RESULT_ERROR_INVALID_BINARY"; + case UR_RESULT_ERROR_INVALID_PROGRAM: + return "UR_RESULT_ERROR_INVALID_PROGRAM"; + case UR_RESULT_ERROR_INVALID_SAMPLER: + return "UR_RESULT_ERROR_INVALID_SAMPLER"; + case UR_RESULT_ERROR_INVALID_BUFFER_SIZE: + return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE"; + case UR_RESULT_ERROR_INVALID_MEM_OBJECT: + return "UR_RESULT_ERROR_INVALID_MEM_OBJECT"; + case UR_RESULT_ERROR_INVALID_EVENT: + return "UR_RESULT_ERROR_INVALID_EVENT"; + case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: + return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST"; + case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET: + return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET"; + case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: + return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE"; + case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE: + return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE"; + case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE: + return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE"; + case UR_RESULT_ERROR_DEVICE_NOT_FOUND: + return "UR_RESULT_ERROR_DEVICE_NOT_FOUND"; + case UR_RESULT_ERROR_INVALID_DEVICE: + return "UR_RESULT_ERROR_INVALID_DEVICE"; + case UR_RESULT_ERROR_DEVICE_LOST: + return "UR_RESULT_ERROR_DEVICE_LOST"; + case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET: + return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET"; + case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE: + return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE"; + case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED: + return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED"; + case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT: + return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT"; + case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE: + return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE"; + case UR_RESULT_ERROR_INVALID_WORK_DIMENSION: + return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION"; + case UR_RESULT_ERROR_INVALID_KERNEL_ARGS: + return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS"; + case UR_RESULT_ERROR_INVALID_KERNEL: + return "UR_RESULT_ERROR_INVALID_KERNEL"; + case UR_RESULT_ERROR_INVALID_KERNEL_NAME: + return "UR_RESULT_ERROR_INVALID_KERNEL_NAME"; + case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: + return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; + case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: + return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; + case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: + return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; + case UR_RESULT_ERROR_INVALID_IMAGE_SIZE: + return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; + case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: + return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: + return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; + case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: + return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; + case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: + return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE"; + case UR_RESULT_ERROR_UNINITIALIZED: + return "UR_RESULT_ERROR_UNINITIALIZED"; + case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: + return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY"; + case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: + return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; + case UR_RESULT_ERROR_OUT_OF_RESOURCES: + return "UR_RESULT_ERROR_OUT_OF_RESOURCES"; + case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE: + return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE"; + case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE: + return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE"; + case UR_RESULT_ERROR_UNSUPPORTED_VERSION: + return "UR_RESULT_ERROR_UNSUPPORTED_VERSION"; + case UR_RESULT_ERROR_UNSUPPORTED_FEATURE: + return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE"; + case UR_RESULT_ERROR_INVALID_ARGUMENT: + return "UR_RESULT_ERROR_INVALID_ARGUMENT"; + case UR_RESULT_ERROR_INVALID_NULL_HANDLE: + return "UR_RESULT_ERROR_INVALID_NULL_HANDLE"; + case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE: + return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; + case UR_RESULT_ERROR_INVALID_NULL_POINTER: + return "UR_RESULT_ERROR_INVALID_NULL_POINTER"; + case UR_RESULT_ERROR_INVALID_SIZE: + return "UR_RESULT_ERROR_INVALID_SIZE"; + case UR_RESULT_ERROR_UNSUPPORTED_SIZE: + return "UR_RESULT_ERROR_UNSUPPORTED_SIZE"; + case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: + return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; + case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: + return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; + case UR_RESULT_ERROR_INVALID_ENUMERATION: + return "UR_RESULT_ERROR_INVALID_ENUMERATION"; + case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: + return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; + case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: + return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; + case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: + return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; + case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: + return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME"; + case UR_RESULT_ERROR_INVALID_FUNCTION_NAME: + return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME"; + case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: + return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; + case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: + return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; + case UR_RESULT_ERROR_PROGRAM_UNLINKED: + return "UR_RESULT_ERROR_PROGRAM_UNLINKED"; + case UR_RESULT_ERROR_OVERLAPPING_REGIONS: + return "UR_RESULT_ERROR_OVERLAPPING_REGIONS"; + case UR_RESULT_ERROR_INVALID_HOST_PTR: + return "UR_RESULT_ERROR_INVALID_HOST_PTR"; + case UR_RESULT_ERROR_INVALID_USM_SIZE: + return "UR_RESULT_ERROR_INVALID_USM_SIZE"; + case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE: + return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE"; + case UR_RESULT_ERROR_ADAPTER_SPECIFIC: + return "UR_RESULT_ERROR_ADAPTER_SPECIFIC"; + default: + return "UR_RESULT_ERROR_UNKNOWN"; + } +}; + +// Trace an internal PI call; returns in case of an error. +#define UR_CALL(Call) \ + { \ + if (PrintTrace) \ + fprintf(stderr, "UR ---> %s\n", #Call); \ + ur_result_t Result = (Call); \ + if (PrintTrace) \ + fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result)); \ + if (Result != UR_RESULT_SUCCESS) \ + return Result; \ + } struct ur_exp_command_buffer_handle_t_ { @@ -18,18 +175,36 @@ struct ur_exp_command_buffer_handle_t_ { ~ur_exp_command_buffer_handle_t_(); + void RegisterSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint, + std::shared_ptr CuNode) { + SyncPoints[SyncPoint] = CuNode; + NextSyncPoint++; + } + + ur_exp_command_buffer_sync_point_t GetNextSyncPoint() const { + return NextSyncPoint; + } + // UR context associated with this command-buffer ur_context_handle_t Context; // Device associated with this command buffer ur_device_handle_t Device; // Cuda Graph handle - CUgraph cudaGraph; + CUgraph CudaGraph; // Cuda Graph Exec handle - CUgraphExec cudaGraphExec; + CUgraphExec CudaGraphExec; // Atomic variable counting the number of reference to this command_buffer // using std::atomic prevents data race when incrementing/decrementing. std::atomic_uint32_t RefCount; + // Map of sync_points to ur_events + std::unordered_map> + SyncPoints; + // Next sync_point value (may need to consider ways to reuse values if 32-bits + // is not enough) + ur_exp_command_buffer_sync_point_t NextSyncPoint; + // Used when retaining an object. uint32_t incrementReferenceCount() noexcept { return ++RefCount; } // Used when releasing an object. diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp index f1a1f25c5b221..085da92c34370 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.cpp @@ -184,6 +184,136 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, return BlockSize * Kernel->getRegsPerThread() > Device->getMaxRegsPerBlock(); } +// Helper to compute and put in good shape kernel parameters from workload +// dimensions. +// @param [in] Context handler to the target Context +// @param [in] Device handler to the target Device +// @param [in] WorkDim workload dimension +// @param [in] GlobalWorkOffset pointer workload global offsets +// @param [in] LocalWorkOffset pointer workload local offsets +// @param [inout] Kernel handler to the kernel +// @param [inout] CuFunc handler to the cuda function attached to the kernel +// @param [out] ThreadsPerBlock Number of threads per block we should run +// @param [out] BlocksPerGrid Number of blocks per grid we should run +ur_result_t +setKernelParams(const ur_context_handle_t Context, + const ur_device_handle_t Device, const uint32_t WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel, + CUfunction &CuFunc, size_t (&ThreadsPerBlock)[3], + size_t (&BlocksPerGrid)[3]) { + ur_result_t Result = UR_RESULT_SUCCESS; + size_t MaxWorkGroupSize = 0u; + size_t MaxThreadsPerBlock[3] = {}; + bool ProvidedLocalWorkGroupSize = (LocalWorkSize != nullptr); + uint32_t LocalSize = Kernel->getLocalSize(); + + try { + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext Active(Context); + { + size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock; + MaxWorkGroupSize = Device->getMaxWorkGroupSize(); + Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), + MaxThreadsPerBlock); + + if (ProvidedLocalWorkGroupSize) { + auto IsValid = [&](int Dim) { + if (ReqdThreadsPerBlock[Dim] != 0 && + LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + + if (LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + // Checks that local work sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than + // the global work sizes and not 0. + if (0u == LocalWorkSize[Dim]) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + if (0u != (GlobalWorkSize[Dim] % LocalWorkSize[Dim])) + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + ThreadsPerBlock[Dim] = LocalWorkSize[Dim]; + return UR_RESULT_SUCCESS; + }; + + size_t KernelLocalWorkGroupSize = 0; + for (size_t Dim = 0; Dim < WorkDim; Dim++) { + auto Err = IsValid(Dim); + if (Err != UR_RESULT_SUCCESS) + return Err; + // If no error then sum the total local work size per dim. + KernelLocalWorkGroupSize += LocalWorkSize[Dim]; + } + + if (hasExceededMaxRegistersPerBlock(Device, Kernel, + KernelLocalWorkGroupSize)) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } else { + guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim, + MaxThreadsPerBlock, Kernel, LocalSize); + } + } + + if (MaxWorkGroupSize < + ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + for (size_t i = 0; i < WorkDim; i++) { + BlocksPerGrid[i] = + (GlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; + } + + // Set the implicit global offset parameter if kernel has offset variant + if (Kernel->get_with_offset_parameter()) { + std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; + if (GlobalWorkOffset) { + for (size_t i = 0; i < WorkDim; i++) { + CudaImplicitOffset[i] = + static_cast(GlobalWorkOffset[i]); + if (GlobalWorkOffset[i] != 0) { + CuFunc = Kernel->get_with_offset_parameter(); + } + } + } + Kernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), + CudaImplicitOffset); + } + + if (Context->getDevice()->maxLocalMemSizeChosen()) { + // Set up local memory requirements for kernel. + auto Device = Context->getDevice(); + if (Device->getMaxChosenLocalMem() < 0) { + setErrorMessage("Invalid value specified for " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) { + setErrorMessage("Too much local memory allocated for device", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + if (LocalSize > static_cast(Device->getMaxChosenLocalMem())) { + setErrorMessage( + "Local memory for kernel exceeds the amount requested using " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value for " + "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + UR_CHECK_ERROR(cuFuncSetAttribute( + CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + Device->getMaxChosenLocalMem())); + } + + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} + /// Enqueues a wait on the given CUstream for all specified events (See /// \ref enqueueEventWaitWithBarrier.) If the events list is empty, the enqueued /// wait will wait on all previous events in the queue. @@ -292,100 +422,30 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; - size_t MaxWorkGroupSize = 0u; - size_t MaxThreadsPerBlock[3] = {}; - bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr); + size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + uint32_t LocalSize = hKernel->getLocalSize(); ur_result_t Result = UR_RESULT_SUCCESS; + CUfunction CuFunc = hKernel->get(); - try { - // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext Active(hQueue->getContext()); - { - size_t *ReqdThreadsPerBlock = hKernel->ReqdThreadsPerBlock; - MaxWorkGroupSize = hQueue->Device->getMaxWorkGroupSize(); - hQueue->Device->getMaxWorkItemSizes(sizeof(MaxThreadsPerBlock), - MaxThreadsPerBlock); - - if (ProvidedLocalWorkGroupSize) { - auto IsValid = [&](int Dim) { - if (ReqdThreadsPerBlock[Dim] != 0 && - pLocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - - if (pLocalWorkSize[Dim] > MaxThreadsPerBlock[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. - if (0u == pLocalWorkSize[Dim]) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - if (0u != (pGlobalWorkSize[Dim] % pLocalWorkSize[Dim])) - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - ThreadsPerBlock[Dim] = pLocalWorkSize[Dim]; - return UR_RESULT_SUCCESS; - }; - - size_t KernelLocalWorkGroupSize = 0; - for (size_t Dim = 0; Dim < workDim; Dim++) { - auto Err = IsValid(Dim); - if (Err != UR_RESULT_SUCCESS) - return Err; - // If no error then sum the total local work size per dim. - KernelLocalWorkGroupSize += pLocalWorkSize[Dim]; - } - - if (hasExceededMaxRegistersPerBlock(hQueue->Device, hKernel, - KernelLocalWorkGroupSize)) { - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - } else { - guessLocalWorkSize(hQueue->Device, ThreadsPerBlock, pGlobalWorkSize, - workDim, MaxThreadsPerBlock, hKernel, LocalSize); - } - } - - if (MaxWorkGroupSize < - ThreadsPerBlock[0] * ThreadsPerBlock[1] * ThreadsPerBlock[2]) { - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - - size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - - for (size_t i = 0; i < workDim; i++) { - BlocksPerGrid[i] = - (pGlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i]; - } + if ((Result = setKernelParams( + hQueue->getContext(), hQueue->Device, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc, ThreadsPerBlock, + BlocksPerGrid)) != UR_RESULT_SUCCESS) { + return Result; + } + try { std::unique_ptr RetImplEvent{nullptr}; uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - CUfunction CuFunc = hKernel->get(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - // Set the implicit global offset parameter if kernel has offset variant - if (hKernel->get_with_offset_parameter()) { - std::uint32_t CudaImplicitOffset[3] = {0, 0, 0}; - if (pGlobalWorkOffset) { - for (size_t i = 0; i < workDim; i++) { - CudaImplicitOffset[i] = - static_cast(pGlobalWorkOffset[i]); - if (pGlobalWorkOffset[i] != 0) { - CuFunc = hKernel->get_with_offset_parameter(); - } - } - } - hKernel->setImplicitOffsetArg(sizeof(CudaImplicitOffset), - CudaImplicitOffset); - } - - auto &ArgIndices = hKernel->getArgIndices(); - if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -393,37 +453,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( RetImplEvent->start(); } - if (hQueue->getContext()->getDevice()->maxLocalMemSizeChosen()) { - // Set up local memory requirements for kernel. - auto Device = hQueue->getContext()->getDevice(); - if (Device->getMaxChosenLocalMem() < 0) { - setErrorMessage("Invalid value specified for " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) { - setErrorMessage("Too much local memory allocated for device", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - if (LocalSize > static_cast(Device->getMaxChosenLocalMem())) { - setErrorMessage( - "Local memory for kernel exceeds the amount requested using " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE. Try increasing the value for " - "SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE.", - UR_RESULT_ERROR_ADAPTER_SPECIFIC); - return UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - UR_CHECK_ERROR(cuFuncSetAttribute( - CuFunc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - Device->getMaxChosenLocalMem())); - } - + auto &ArgIndices = hKernel->getArgIndices(); Result = UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], LocalSize, CuStream, const_cast(ArgIndices.data()), nullptr)); + if (LocalSize != 0) hKernel->clearLocalSize(); @@ -431,6 +466,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( Result = RetImplEvent->record(); *phEvent = RetImplEvent.release(); } + } catch (ur_result_t Err) { Result = Err; } diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp index 393085ce42eb2..b6a458eb53ac7 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/enqueue.hpp @@ -14,3 +14,20 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList); + +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + const size_t MaxThreadsPerBlock[3], + ur_kernel_handle_t Kernel, uint32_t LocalSize); + +bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, + ur_kernel_handle_t Kernel, + size_t BlockSize); + +ur_result_t +setKernelParams(const ur_context_handle_t Context, + const ur_device_handle_t Device, const uint32_t WorkDim, + const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, + const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel, + CUfunction &CuFunc, size_t (&ThreadsPerBlock)[3], + size_t (&BlocksPerGrid)[3]); diff --git a/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp index 44901e3fb452c..6f3c5acbf5873 100644 --- a/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp +++ b/sycl/test-e2e/Graph/Explicit/add_node_while_recording.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp index 8ad6a413aea03..d41484a285602 100644 --- a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp +++ b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp b/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp index 6191a875bbe41..b81a916e0fb5d 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_buffer.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp index c7adb7f282da4..b6d765c60090e 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp index 79e53ff4ba9d9..ac4564ce5797f 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp index fa5a2a1f018e6..17a2827372667 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp index 1b7447940e1fe..39843f07ff876 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp index 26e5473bded66..2a8c69c2afca8 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_system.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp index 2c2edd374febc..f32ca3d9d5067 100644 --- a/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp +++ b/sycl/test-e2e/Graph/Explicit/buffer_ordering.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/depends_on.cpp b/sycl/test-e2e/Graph/Explicit/depends_on.cpp index 6454a0eeeaa0e..73a8a77361b55 100644 --- a/sycl/test-e2e/Graph/Explicit/depends_on.cpp +++ b/sycl/test-e2e/Graph/Explicit/depends_on.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/dotp.cpp b/sycl/test-e2e/Graph/Explicit/dotp.cpp index 37c8f1341343a..f976fbcc5c3ba 100644 --- a/sycl/test-e2e/Graph/Explicit/dotp.cpp +++ b/sycl/test-e2e/Graph/Explicit/dotp.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp b/sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp index 4a2d46c611b70..bd6bf65730c55 100644 --- a/sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp +++ b/sycl/test-e2e/Graph/Explicit/dotp_buffer.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/dotp_host_mem.cpp b/sycl/test-e2e/Graph/Explicit/dotp_host_mem.cpp index 95beeefc8df86..1f252b4338f74 100644 --- a/sycl/test-e2e/Graph/Explicit/dotp_host_mem.cpp +++ b/sycl/test-e2e/Graph/Explicit/dotp_host_mem.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/dotp_host_shared.cpp b/sycl/test-e2e/Graph/Explicit/dotp_host_shared.cpp index 2ebbd07d2c14c..88b473e7e5c4c 100644 --- a/sycl/test-e2e/Graph/Explicit/dotp_host_shared.cpp +++ b/sycl/test-e2e/Graph/Explicit/dotp_host_shared.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp b/sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp index 590b5cee19919..a0449d6406d3e 100644 --- a/sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp +++ b/sycl/test-e2e/Graph/Explicit/dotp_mixed.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/dotp_shared_mem.cpp b/sycl/test-e2e/Graph/Explicit/dotp_shared_mem.cpp index 6fd1d79b8ba01..af0d5b916222f 100644 --- a/sycl/test-e2e/Graph/Explicit/dotp_shared_mem.cpp +++ b/sycl/test-e2e/Graph/Explicit/dotp_shared_mem.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/dotp_system_mem.cpp b/sycl/test-e2e/Graph/Explicit/dotp_system_mem.cpp index e53598a9bca4d..c9c91268cbbb5 100644 --- a/sycl/test-e2e/Graph/Explicit/dotp_system_mem.cpp +++ b/sycl/test-e2e/Graph/Explicit/dotp_system_mem.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/empty.cpp b/sycl/test-e2e/Graph/Explicit/empty.cpp index 04cec170a9010..548e0aefbec01 100644 --- a/sycl/test-e2e/Graph/Explicit/empty.cpp +++ b/sycl/test-e2e/Graph/Explicit/empty.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/empty_node.cpp b/sycl/test-e2e/Graph/Explicit/empty_node.cpp index 687a25b923d78..638ccbde77e18 100644 --- a/sycl/test-e2e/Graph/Explicit/empty_node.cpp +++ b/sycl/test-e2e/Graph/Explicit/empty_node.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/empty_with_deps.cpp b/sycl/test-e2e/Graph/Explicit/empty_with_deps.cpp index 799e202422c51..803cf0ec32c19 100644 --- a/sycl/test-e2e/Graph/Explicit/empty_with_deps.cpp +++ b/sycl/test-e2e/Graph/Explicit/empty_with_deps.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp index 6948090873181..559646103f557 100644 --- a/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp +++ b/sycl/test-e2e/Graph/Explicit/enqueue_ordering.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp index a414e3f4b8d6c..9de3fd735eee5 100644 --- a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/node_ordering.cpp b/sycl/test-e2e/Graph/Explicit/node_ordering.cpp index 233013ba5cf8f..2fa23ac5559ab 100644 --- a/sycl/test-e2e/Graph/Explicit/node_ordering.cpp +++ b/sycl/test-e2e/Graph/Explicit/node_ordering.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp index 305831a6abb9b..495a5c3e7b2f7 100644 --- a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp +++ b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/saxpy.cpp b/sycl/test-e2e/Graph/Explicit/saxpy.cpp index 8c8ef7fad2c0e..e6214fcddf064 100644 --- a/sycl/test-e2e/Graph/Explicit/saxpy.cpp +++ b/sycl/test-e2e/Graph/Explicit/saxpy.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Explicit/single_node.cpp b/sycl/test-e2e/Graph/Explicit/single_node.cpp index 766f6f08de281..3ef60d11d9e82 100644 --- a/sycl/test-e2e/Graph/Explicit/single_node.cpp +++ b/sycl/test-e2e/Graph/Explicit/single_node.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp index 154ea4e3470e3..ff3a8c1b2eea9 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp index edce73a46ad73..ad0badd14ed98 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp index b8863b57c7290..bc66115d8fa35 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp index fe906bb7aba14..a1dc0e479da73 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp index 4254a861fe344..7847981a86577 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp index d5b3ff7412b61..c8fd1d803266c 100644 --- a/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp +++ b/sycl/test-e2e/Graph/Explicit/temp_buffer_reinterpret.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Explicit/while_recording.cpp b/sycl/test-e2e/Graph/Explicit/while_recording.cpp index b2cef162c7f06..910f6aa31c1f4 100644 --- a/sycl/test-e2e/Graph/Explicit/while_recording.cpp +++ b/sycl/test-e2e/Graph/Explicit/while_recording.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp index be0bcef2c8934..711a7c0838a1b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp index 7d0c7c81d780f..70052a1a2220b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_buffer.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp index a6c098b472847..988e22b75d049 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp index c3492b6d26722..d60dc0f454242 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp index b4b7f26ceebbf..4645f079cd004 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp index c3a140d64eae4..51747fa00f7a6 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp index e731b586885ac..360cb915f757e 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_system.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp index 9910cc82d6e6b..beda0d2b33ee8 100644 --- a/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/buffer_ordering.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp b/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp index a25b0ff4c5a11..bf9eb1cf4ba90 100644 --- a/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/concurrent_queue.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp.cpp index 5e3403d688f60..9da5eabb01c03 100644 --- a/sycl/test-e2e/Graph/RecordReplay/dotp.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/dotp.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_buffer.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_buffer.cpp index 030c5bc857050..8d911b2e621d6 100644 --- a/sycl/test-e2e/Graph/RecordReplay/dotp_buffer.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_buffer.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_host_mem.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_host_mem.cpp index dca1d85cdf76e..c6f59da7ed8e0 100644 --- a/sycl/test-e2e/Graph/RecordReplay/dotp_host_mem.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_host_mem.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_host_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_host_shared.cpp index 2171c6db87e82..cb584495e978b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/dotp_host_shared.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_host_shared.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp index 756fccdc99611..31aae2ae6dc27 100644 --- a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp index 2e41e5a85f5b9..935fa7b4ad297 100644 --- a/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_in_order_with_empty_nodes.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp index b0d988ca6deda..18be077ddb6e8 100644 --- a/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_multiple_queues.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_shared_mem.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_shared_mem.cpp index a791311435a89..9564bcdee5c8b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/dotp_shared_mem.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_shared_mem.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/dotp_system_mem.cpp b/sycl/test-e2e/Graph/RecordReplay/dotp_system_mem.cpp index 4e8465bac3b0b..f0224c5b02995 100644 --- a/sycl/test-e2e/Graph/RecordReplay/dotp_system_mem.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/dotp_system_mem.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/empty.cpp b/sycl/test-e2e/Graph/RecordReplay/empty.cpp index 74c96aa407da9..2e3d961b84550 100644 --- a/sycl/test-e2e/Graph/RecordReplay/empty.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/empty.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp index 967cfbaaf58e8..95cf1baa62835 100644 --- a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node_with_dep.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node_with_dep.cpp index c247d61796df6..73aa9a9b62b28 100644 --- a/sycl/test-e2e/Graph/RecordReplay/empty_node_with_dep.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/empty_node_with_dep.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp index 8a59f12d316b4..1a6754619f7d0 100644 --- a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp index 3a702d025b3d3..4e5f0d35dbdd5 100644 --- a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/saxpy.cpp b/sycl/test-e2e/Graph/RecordReplay/saxpy.cpp index ddcc163672d2d..67cf264a102ad 100644 --- a/sycl/test-e2e/Graph/RecordReplay/saxpy.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/saxpy.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/simple_shared_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/simple_shared_usm.cpp index 6719fdcb35160..98625b801b221 100644 --- a/sycl/test-e2e/Graph/RecordReplay/simple_shared_usm.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/simple_shared_usm.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp index 2515d95fac5f2..8ce937c7d7195 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp index 4921e51cc98aa..afa6ec38574a4 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp index 0f21e3b6fefd9..0eb2279f1ef1d 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp index aedd9e252e252..767cf279b7c06 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp index 6fc2b39efade3..75865f6f08697 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp index 6eb49c39583da..382d1ab67fb18 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp index a51bcc967b2ee..1a124ae49d3b3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/temp_buffer_reinterpret.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp b/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp index 6cc286bc2c96b..333685c2a49ea 100644 --- a/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/temp_scope.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out // Extra run to check for leaks in Level Zero using ZE_DEBUG diff --git a/sycl/test-e2e/Graph/Threading/finalize.cpp b/sycl/test-e2e/Graph/Threading/finalize.cpp index e6e3337f2bd87..995b60b33edcb 100644 --- a/sycl/test-e2e/Graph/Threading/finalize.cpp +++ b/sycl/test-e2e/Graph/Threading/finalize.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build_pthread_inc} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Threading/queue_state.cpp b/sycl/test-e2e/Graph/Threading/queue_state.cpp index b1713d2e9b1dd..e38976a7fc3e8 100644 --- a/sycl/test-e2e/Graph/Threading/queue_state.cpp +++ b/sycl/test-e2e/Graph/Threading/queue_state.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu, TEMPORARY_DISABLED +// REQUIRES: cuda || level_zero, gpu, TEMPORARY_DISABLED // Disabled as thread safety not yet implemented // RUN: %clangxx -pthread -fsycl -fsycl-targets=%sycl_triple %s -o %t.out diff --git a/sycl/test-e2e/Graph/Threading/submit.cpp b/sycl/test-e2e/Graph/Threading/submit.cpp index 193ce76687bdf..5deace7cb8173 100644 --- a/sycl/test-e2e/Graph/Threading/submit.cpp +++ b/sycl/test-e2e/Graph/Threading/submit.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build_pthread_inc} -o %t.out // RUN: %{run} %t.out // RUN: %if ext_oneapi_level_zero %{env ZE_DEBUG=4 %{run} %t.out 2>&1 | FileCheck %s %} diff --git a/sycl/test-e2e/Graph/Threading/update.cpp b/sycl/test-e2e/Graph/Threading/update.cpp index 6e9b7bcbdb3bb..f65fb8bcc915e 100644 --- a/sycl/test-e2e/Graph/Threading/update.cpp +++ b/sycl/test-e2e/Graph/Threading/update.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu, TEMPORARY_DISABLED +// REQUIRES: cuda || level_zero, gpu, TEMPORARY_DISABLED // Disabled as Update feature is not yet implemented // RUN: %clangxx -pthread -fsycl -fsycl-targets=%sycl_triple %s -o %t.out diff --git a/sycl/test-e2e/Graph/cuda_backend_create_and_finalize_empty_graph.cpp b/sycl/test-e2e/Graph/cuda_backend_create_and_finalize_empty_graph.cpp deleted file mode 100644 index 585d3841d5ddb..0000000000000 --- a/sycl/test-e2e/Graph/cuda_backend_create_and_finalize_empty_graph.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// REQUIRES: cuda, gpu -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// Tests the ability to finalize a empty command graph -// without submitting the graph. - -#include "graph_common.hpp" - -int GetCudaBackend(const sycl::device &Dev) { - // Return 1 if the device backend is "cuda" or 0 else. - // 0 does not prevent another device to be picked as a second choice - return Dev.get_backend() == backend::ext_oneapi_cuda; -} - -int main() { - sycl::device CudaDev{GetCudaBackend}; - queue Queue{CudaDev}; - - // Skip the test if no cuda backend found - if (CudaDev.get_backend() != backend::ext_oneapi_cuda) - return 0; - - std::error_code ErrorCode = make_error_code(sycl::errc::success); - // This should not throw an exception - try { - exp_ext::command_graph Graph{Queue.get_context(), CudaDev}; - auto GraphExec = Graph.finalize(); - } catch (const sycl::exception &e) { - ErrorCode = e.code(); - } - assert(ErrorCode == sycl::errc::success); - - return 0; -} diff --git a/sycl/test-e2e/Graph/empty_graph.cpp b/sycl/test-e2e/Graph/empty_graph.cpp index 90eb43986275a..e9937cd230cc2 100644 --- a/sycl/test-e2e/Graph/empty_graph.cpp +++ b/sycl/test-e2e/Graph/empty_graph.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Graph/finalize_empty.cpp b/sycl/test-e2e/Graph/finalize_empty.cpp index dc29455ac2e81..1b1b0f8cee473 100644 --- a/sycl/test-e2e/Graph/finalize_empty.cpp +++ b/sycl/test-e2e/Graph/finalize_empty.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Graph/finalize_twice.cpp b/sycl/test-e2e/Graph/finalize_twice.cpp index 31c178e8eeae5..752a63976f4d3 100644 --- a/sycl/test-e2e/Graph/finalize_twice.cpp +++ b/sycl/test-e2e/Graph/finalize_twice.cpp @@ -1,4 +1,4 @@ -// REQUIRES: level_zero, gpu +// REQUIRES: cuda || level_zero, gpu // RUN: %{build} -o %t.out // RUN: %{run} %t.out