From cfa3744c3bf55bc7d004687feab14fe0ae6dbfd1 Mon Sep 17 00:00:00 2001
From: Petr Vesely
Date: Tue, 8 Aug 2023 11:03:37 +0100
Subject: [PATCH 1/3] [UR] Port PI CUDA unittests to UR

---
 source/adapters/cuda/CMakeLists.txt           |   2 +-
 test/conformance/adapters/cuda/CMakeLists.txt |   6 +-
 .../adapters/cuda/cuda_context_tests.cpp      | 212 ++++++++++++++
 .../conformance/adapters/cuda/cuda_fixtures.h |  14 +-
 .../cuda_urDeviceCreateWithNativeHandle.cpp   |   2 +-
 .../adapters/cuda/kernel_tests.cpp            | 270 ++++++++++++++++++
 .../adapters/cuda/memory_tests.cpp            |  27 ++
 7 files changed, 527 insertions(+), 6 deletions(-)
 create mode 100644 test/conformance/adapters/cuda/cuda_context_tests.cpp
 create mode 100644 test/conformance/adapters/cuda/kernel_tests.cpp
 create mode 100644 test/conformance/adapters/cuda/memory_tests.cpp

diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt
index f85d759c09..1222b17f67 100644
--- a/source/adapters/cuda/CMakeLists.txt
+++ b/source/adapters/cuda/CMakeLists.txt
@@ -3,7 +3,7 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-set(CUDA_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/cuda")
+set(CUDA_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/cuda" CACHE PATH "CUDA adapter directory")
 
 set(TARGET_NAME ur_adapter_cuda)
 
diff --git a/test/conformance/adapters/cuda/CMakeLists.txt b/test/conformance/adapters/cuda/CMakeLists.txt
index 241eb87a8c..6d653c2374 100644
--- a/test/conformance/adapters/cuda/CMakeLists.txt
+++ b/test/conformance/adapters/cuda/CMakeLists.txt
@@ -5,13 +5,17 @@ add_conformance_test_with_devices_environment(adapter-cuda
     cuda_fixtures.h
+    cuda_context_tests.cpp
     cuda_urContextGetNativeHandle.cpp
     cuda_urDeviceGetNativeHandle.cpp
     cuda_urDeviceCreateWithNativeHandle.cpp
     cuda_urEventGetNativeHandle.cpp
     cuda_urEventCreateWithNativeHandle.cpp
+    kernel_tests.cpp
+    memory_tests.cpp
 )
-target_link_libraries(test-adapter-cuda PRIVATE cudadrv)
+target_link_libraries(test-adapter-cuda PRIVATE cudadrv ur_adapter_cuda)
+target_include_directories(test-adapter-cuda PRIVATE ${CUDA_DIR} "${CUDA_DIR}/../../../" )
 set_tests_properties(adapter-cuda PROPERTIES
     LABELS "conformance:cuda"
diff --git a/test/conformance/adapters/cuda/cuda_context_tests.cpp b/test/conformance/adapters/cuda/cuda_context_tests.cpp
new file mode 100644
index 0000000000..e8f4348e15
--- /dev/null
+++ b/test/conformance/adapters/cuda/cuda_context_tests.cpp
@@ -0,0 +1,212 @@
+// Copyright (C) 2022-2023 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "context.hpp"
+#include "cuda_fixtures.h"
+#include "queue.hpp"
+#include <thread>
+
+using cudaUrContextCreateTest = uur::urDeviceTest;
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(cudaUrContextCreateTest);
+
+constexpr unsigned int known_cuda_api_version = 3020;
+
+TEST_P(cudaUrContextCreateTest, CreateWithChildThread) {
+
+    ur_context_handle_t context = nullptr;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context));
+    ASSERT_NE(context, nullptr);
+
+    // Retrieve the CUDA context to check information is correct
+    auto checkValue = [=] {
+        CUcontext cudaContext = context->get();
+        unsigned int version = 0;
+        EXPECT_SUCCESS_CUDA(cuCtxGetApiVersion(cudaContext, &version));
+        EXPECT_EQ(version, known_cuda_api_version);
+
+        // The current context is different from the current thread
+        CUcontext current;
+        ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+        EXPECT_NE(cudaContext, current);
+
+        // Set the context
+        EXPECT_SUCCESS_CUDA(cuCtxPushCurrent(cudaContext));
+        EXPECT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+        EXPECT_EQ(cudaContext, current);
+    };
+
+    auto callContextFromOtherThread = std::thread(checkValue);
+    callContextFromOtherThread.join();
+    ASSERT_SUCCESS(urContextRelease(context));
+}
+
+TEST_P(cudaUrContextCreateTest, ActiveContext) {
+    // start with no active context
+    uur::clearCudaContext();
+
+    ur_context_handle_t context = nullptr;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context));
+    ASSERT_NE(context, nullptr);
+
+    ur_queue_handle_t queue = nullptr;
+    ur_queue_properties_t queue_props{UR_STRUCTURE_TYPE_QUEUE_PROPERTIES,
+                                      nullptr, 0};
+    ASSERT_SUCCESS(urQueueCreate(context, device, &queue_props, &queue));
+    ASSERT_NE(queue, nullptr);
+
+    // check that the queue has the correct context
+    ASSERT_EQ(context, queue->getContext());
+
+    // create a buffer
+    ur_mem_handle_t buffer = nullptr;
+    ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, 1024,
+                                     nullptr, &buffer));
+    ASSERT_NE(buffer, nullptr);
+
+    // check that the context is now the active CUDA context
+    CUcontext cudaCtx = nullptr;
+    ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&cudaCtx));
+    ASSERT_NE(cudaCtx, nullptr);
+
+    ur_native_handle_t native_context = nullptr;
+    ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context));
+    ASSERT_NE(native_context, nullptr);
+    ASSERT_EQ(cudaCtx, reinterpret_cast<CUcontext>(native_context));
+
+    // release resources
+    ASSERT_SUCCESS(urMemRelease(buffer));
+    ASSERT_SUCCESS(urQueueRelease(queue));
+    ASSERT_SUCCESS(urContextRelease(context));
+}
+
+TEST_P(cudaUrContextCreateTest, ContextLifetimeExisting) {
+    // start by setting up a CUDA context on the thread
+    CUcontext original;
+    ASSERT_SUCCESS_CUDA(cuCtxCreate(&original, CU_CTX_MAP_HOST, device->get()));
+
+    // ensure the CUDA context is active
+    CUcontext current = nullptr;
+    ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+    ASSERT_EQ(original, current);
+
+    // create a UR context
+    ur_context_handle_t context;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context));
+    ASSERT_NE(context, nullptr);
+
+    // create a queue with the context
+    ur_queue_handle_t queue;
+    ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, &queue));
+    ASSERT_NE(queue, nullptr);
+
+    // ensure the queue has the correct context
+    ASSERT_EQ(context, queue->getContext());
+
+    // create a buffer in the context to set the context as active
+    ur_mem_handle_t buffer;
+    ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, 1024,
+                                     nullptr, &buffer));
+
+    // check that context is now the active cuda context
+    ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+    ASSERT_EQ(current, context->get());
+
+    ASSERT_SUCCESS(urQueueRelease(queue));
+    ASSERT_SUCCESS(urContextRelease(context));
+
+    // check that the context was cleaned up, the old context will be restored
+}
+
+TEST_P(cudaUrContextCreateTest, ThreadedContext) {
+    // start with no active cuda context
+    uur::clearCudaContext();
+
+    // create two new UR contexts
+    ur_context_handle_t context1;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context1));
+    ASSERT_NE(context1, nullptr);
+
+    ur_context_handle_t context2;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context2));
+    ASSERT_NE(context2, nullptr);
+
+    // setup synchronization variables between the main thread and
+    // the testing thread
+    std::mutex m;
+    std::condition_variable cv;
+    bool released = false;
+    bool thread_done = false;
+
+    // create a testing thread that will create a queue with the first context,
+    // release the queue, then wait for the main thread to release
+    // the first context, and then create and release another queue with
+    // the second context.
+    auto test_thread = std::thread([&] {
+        CUcontext current = nullptr;
+
+        // create a queue with the first context
+        ur_queue_handle_t queue;
+        ASSERT_SUCCESS(urQueueCreate(context1, device, nullptr, &queue));
+        ASSERT_NE(queue, nullptr);
+
+        // ensure that the queue has the correct context
+        ASSERT_EQ(context1, queue->getContext());
+
+        // create a buffer to set context1 as the active context
+        ur_mem_handle_t buffer;
+        ASSERT_SUCCESS(urMemBufferCreate(context1, UR_MEM_FLAG_READ_WRITE, 1024,
+                                         nullptr, &buffer));
+        ASSERT_NE(buffer, nullptr);
+
+        // release the mem and queue
+        ASSERT_SUCCESS(urMemRelease(buffer));
+        ASSERT_SUCCESS(urQueueRelease(queue));
+
+        // mark the first set of processing as done and notify the main thread
+        std::unique_lock<std::mutex> lock(m);
+        thread_done = true;
+        lock.unlock();
+        cv.notify_one();
+
+        // wait for the main thread to release the first context
+        lock.lock();
+        cv.wait(lock, [&] { return released; });
+
+        // create a queue with the 2nd context
+        ASSERT_SUCCESS(urQueueCreate(context2, device, nullptr, &queue));
+        ASSERT_NE(queue, nullptr);
+
+        // ensure queue has correct context
+        ASSERT_EQ(context2, queue->getContext());
+
+        // create a buffer to set the active context
+        ASSERT_SUCCESS(urMemBufferCreate(context2, UR_MEM_FLAG_READ_WRITE, 1024,
+                                         nullptr, &buffer));
+
+        // check that the 2nd context is now the active cuda context
+        ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+        ASSERT_EQ(current, context2->get());
+
+        // release
+        ASSERT_SUCCESS(urMemRelease(buffer));
+        ASSERT_SUCCESS(urQueueRelease(queue));
+    });
+
+    // wait for the thread to be done with the first queue to release the first
+    // context
+    std::unique_lock<std::mutex> lock(m);
+    cv.wait(lock, [&] { return thread_done; });
+    ASSERT_SUCCESS(urContextRelease(context1));
+
+    // notify the other thread that the context was released
+    released = true;
+    lock.unlock();
+    cv.notify_one();
+
+    // wait for the thread to finish
+    test_thread.join();
+
+    ASSERT_SUCCESS(urContextRelease(context2));
+}
diff --git a/test/conformance/adapters/cuda/cuda_fixtures.h b/test/conformance/adapters/cuda/cuda_fixtures.h
index e367a4aa2c..062cc83db0 100644
--- a/test/conformance/adapters/cuda/cuda_fixtures.h
+++ b/test/conformance/adapters/cuda/cuda_fixtures.h
@@ -19,6 +19,15 @@ struct ResultCuda {
     CUresult value;
 };
+
+inline void clearCudaContext() {
+    CUcontext context = nullptr;
+    do {
cuCtxSetCurrent(nullptr); + cuCtxGetCurrent(&context); + } while (context != nullptr); +} + } // namespace uur #ifndef ASSERT_EQ_RESULT_CUDA @@ -32,12 +41,11 @@ struct ResultCuda { #ifndef EXPECT_EQ_RESULT_CUDA #define EXPECT_EQ_RESULT_CUDA(EXPECTED, ACTUAL) \ - EXPECT_EQ_RESULT_CUDA(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL)) + ASSERT_EQ(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL)) #endif // EXPECT_EQ_RESULT_CUDA #ifndef EXPECT_SUCCESS_CUDA -#define EXPECT_SUCCESS_CUDA(ACTUAL) \ - EXPECT_EQ_RESULT_CUDA(UR_RESULT_SUCCESS, ACTUAL) +#define EXPECT_SUCCESS_CUDA(ACTUAL) EXPECT_EQ_RESULT_CUDA(CUDA_SUCCESS, ACTUAL) #endif // EXPECT_EQ_RESULT_CUDA #endif // UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED diff --git a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp index b116c9a5c9..7fcee9a8f2 100644 --- a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp @@ -7,7 +7,7 @@ using urCudaDeviceCreateWithNativeHandle = uur::urPlatformTest; -TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { +TEST_F(urCudaDeviceCreateWithNativeHandle, DISABLED_Success) { // get a device from cuda int nCudaDevices; ASSERT_SUCCESS_CUDA(cuDeviceGetCount(&nCudaDevices)); diff --git a/test/conformance/adapters/cuda/kernel_tests.cpp b/test/conformance/adapters/cuda/kernel_tests.cpp new file mode 100644 index 0000000000..1e7f54f3cf --- /dev/null +++ b/test/conformance/adapters/cuda/kernel_tests.cpp @@ -0,0 +1,270 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "kernel.hpp" +#include + +using cudaKernelTest = uur::urQueueTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(cudaKernelTest); + +inline constexpr size_t NumberOfImplicitArgsCUDA = 1; + +const char *ptxSource = "\n\ +.version 3.2\n\ +.target sm_20\n\ +.address_size 64\n\ +.visible .entry _Z8myKernelPi(\n\ + .param .u64 _Z8myKernelPi_param_0\n\ +)\n\ +{\n\ + .reg .s32 %r<5>;\n\ + .reg .s64 %rd<5>;\n\ + ld.param.u64 %rd1, [_Z8myKernelPi_param_0];\n\ + cvta.to.global.u64 %rd2, %rd1;\n\ + .loc 1 3 1\n\ + mov.u32 %r1, %ntid.x;\n\ + mov.u32 %r2, %ctaid.x;\n\ + mov.u32 %r3, %tid.x;\n\ + mad.lo.s32 %r4, %r1, %r2, %r3;\n\ + mul.wide.s32 %rd3, %r4, 4;\n\ + add.s64 %rd4, %rd2, %rd3;\n\ + .loc 1 4 1\n\ + st.global.u32 [%rd4], %r4;\n\ + .loc 1 5 2\n\ + ret;\n\ + ret;\ +\n\ +}\ +\n\ +"; + +const char *twoParams = "\n\ +.version 3.2\n\ +.target sm_20\n\ +.address_size 64\n\ +.visible .entry twoParamKernel(\n\ + .param .u64 twoParamKernel_param_0,\n\ + .param .u64 twoParamKernel_param_1\n\ +)\n\ +{\n\ + ret;\ + \n\ +}\n\ +"; + +const char *threeParamsTwoLocal = "\n\ +.version 3.2\n\ +.target sm_20\n\ +.address_size 64\n\ +.visible .entry twoParamKernelLocal(\n\ + .param .u64 twoParamKernel_param_0,\n\ + .param .u32 twoParamKernel_param_1,\n\ + .param .u32 twoParamKernel_param_2\n\ +)\n\ +{\n\ + ret;\ + \n\ +}\n\ +"; + +TEST_P(cudaKernelTest, CreateProgramAndKernel) { + + ur_program_handle_t program = nullptr; + ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, sizeof(ptxSource), + (const uint8_t *)ptxSource, + nullptr, &program)); + ASSERT_NE(program, nullptr); + ASSERT_SUCCESS(urProgramBuild(context, program, nullptr)); + + ur_kernel_handle_t kernel = nullptr; + 
+    ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel));
+    ASSERT_NE(kernel, nullptr);
+
+    ASSERT_SUCCESS(urKernelRelease(kernel));
+    ASSERT_SUCCESS(urProgramRelease(program));
+}
+
+TEST_P(cudaKernelTest, CreateProgramAndKernelWithMetadata) {
+
+    std::vector<uint32_t> reqdWorkGroupSizeMD;
+    reqdWorkGroupSizeMD.reserve(5);
+    // 64-bit representing bit size
+    reqdWorkGroupSizeMD.push_back(96);
+    reqdWorkGroupSizeMD.push_back(0);
+    // reqd_work_group_size x
+    reqdWorkGroupSizeMD.push_back(8);
+    // reqd_work_group_size y
+    reqdWorkGroupSizeMD.push_back(16);
+    // reqd_work_group_size z
+    reqdWorkGroupSizeMD.push_back(32);
+
+    const char *reqdWorkGroupSizeMDConstName =
+        "_Z8myKernelPi@reqd_work_group_size";
+    std::vector<char> reqdWorkGroupSizeMDName(
+        reqdWorkGroupSizeMDConstName, reqdWorkGroupSizeMDConstName +
+                                          strlen(reqdWorkGroupSizeMDConstName) +
+                                          1);
+
+    ur_program_metadata_value_t reqd_work_group_value;
+    reqd_work_group_value.pData = reqdWorkGroupSizeMD.data();
+
+    ur_program_metadata_t reqdWorkGroupSizeMDProp = {
+        reqdWorkGroupSizeMDName.data(), UR_PROGRAM_METADATA_TYPE_BYTE_ARRAY,
+        reqdWorkGroupSizeMD.size() * sizeof(uint32_t), reqd_work_group_value};
+
+    ur_program_properties_t programProps{UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES,
+                                         nullptr, 1, &reqdWorkGroupSizeMDProp};
+    ur_program_handle_t program = nullptr;
+    ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, sizeof(ptxSource),
+                                             (const uint8_t *)ptxSource,
+                                             &programProps, &program));
+    ASSERT_NE(program, nullptr);
+
+    ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+
+    ur_kernel_handle_t kernel = nullptr;
+    ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel));
+
+    size_t compileWGSize[3] = {0};
+    ASSERT_SUCCESS(urKernelGetGroupInfo(
+        kernel, device, UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
+        sizeof(compileWGSize), &compileWGSize, nullptr));
+
+    for (int i = 0; i < 3; i++) {
+        ASSERT_EQ(compileWGSize[i], reqdWorkGroupSizeMD[i + 2]);
+    }
+
+    ASSERT_SUCCESS(urKernelRelease(kernel));
+    ASSERT_SUCCESS(urProgramRelease(program));
+}
+
+TEST_P(cudaKernelTest, URKernelArgumentSimple) {
+    ur_program_handle_t program = nullptr;
+    ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, sizeof(ptxSource),
+                                             (const uint8_t *)ptxSource,
+                                             nullptr, &program));
+    ASSERT_NE(program, nullptr);
+    ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+
+    ur_kernel_handle_t kernel = nullptr;
+    ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel));
+    ASSERT_NE(kernel, nullptr);
+
+    int number = 10;
+    ASSERT_SUCCESS(
+        urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number));
+    const auto &kernelArgs = kernel->getArgIndices();
+    ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA);
+
+    int storedValue = *static_cast<const int *>(kernelArgs[0]);
+    ASSERT_EQ(storedValue, number);
+
+    ASSERT_SUCCESS(urKernelRelease(kernel));
+    ASSERT_SUCCESS(urProgramRelease(program));
+}
+
+TEST_P(cudaKernelTest, URKernelArgumentSetTwice) {
+    ur_program_handle_t program = nullptr;
+    ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, sizeof(ptxSource),
+                                             (const uint8_t *)ptxSource,
+                                             nullptr, &program));
+    ASSERT_NE(program, nullptr);
+    ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+
+    ur_kernel_handle_t kernel = nullptr;
+    ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel));
+    ASSERT_NE(kernel, nullptr);
+
+    int number = 10;
+    ASSERT_SUCCESS(
+        urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number));
+    const auto &kernelArgs = kernel->getArgIndices();
ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA); + int storedValue = *static_cast(kernelArgs[0]); + ASSERT_EQ(storedValue, number); + + int otherNumber = 934; + ASSERT_SUCCESS( + urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &otherNumber)); + const auto kernelArgs2 = kernel->getArgIndices(); + ASSERT_EQ(kernelArgs2.size(), 1 + NumberOfImplicitArgsCUDA); + storedValue = *static_cast(kernelArgs2[0]); + ASSERT_EQ(storedValue, otherNumber); + + ASSERT_SUCCESS(urKernelRelease(kernel)); + ASSERT_SUCCESS(urProgramRelease(program)); +} + +TEST_P(cudaKernelTest, URKernelDispatch) { + ur_program_handle_t program = nullptr; + ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, sizeof(ptxSource), + (const uint8_t *)ptxSource, + nullptr, &program)); + ASSERT_NE(program, nullptr); + ASSERT_SUCCESS(urProgramBuild(context, program, nullptr)); + + ur_kernel_handle_t kernel = nullptr; + ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel)); + ASSERT_NE(kernel, nullptr); + + const size_t memSize = 1024u; + ur_mem_handle_t buffer = nullptr; + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize, + nullptr, &buffer)); + ASSERT_NE(buffer, nullptr); + ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 0, nullptr, buffer)); + + const size_t workDim = 1; + const size_t globalWorkOffset[] = {0}; + const size_t globalWorkSize[] = {1}; + const size_t localWorkSize[] = {1}; + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, workDim, + globalWorkOffset, globalWorkSize, + localWorkSize, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_SUCCESS(urMemRelease(buffer)); + ASSERT_SUCCESS(urKernelRelease(kernel)); + ASSERT_SUCCESS(urProgramRelease(program)); +} + +TEST_P(cudaKernelTest, URKernelDispatchTwo) { + ur_program_handle_t program = nullptr; + ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, sizeof(ptxSource), + (const uint8_t *)twoParams, + nullptr, &program)); + ASSERT_NE(program, nullptr); + ASSERT_SUCCESS(urProgramBuild(context, program, nullptr)); + + ur_kernel_handle_t kernel = nullptr; + ASSERT_SUCCESS(urKernelCreate(program, "twoParamKernel", &kernel)); + ASSERT_NE(kernel, nullptr); + + const size_t memSize = 1024u; + ur_mem_handle_t buffer1 = nullptr; + ur_mem_handle_t buffer2 = nullptr; + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize, + nullptr, &buffer1)); + ASSERT_NE(buffer1, nullptr); + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize, + nullptr, &buffer2)); + ASSERT_NE(buffer1, nullptr); + ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 0, nullptr, buffer1)); + ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 1, nullptr, buffer2)); + + const size_t workDim = 1; + const size_t globalWorkOffset[] = {0}; + const size_t globalWorkSize[] = {1}; + const size_t localWorkSize[] = {1}; + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, workDim, + globalWorkOffset, globalWorkSize, + localWorkSize, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_SUCCESS(urMemRelease(buffer1)); + ASSERT_SUCCESS(urMemRelease(buffer2)); + ASSERT_SUCCESS(urKernelRelease(kernel)); + ASSERT_SUCCESS(urProgramRelease(program)); +} diff --git a/test/conformance/adapters/cuda/memory_tests.cpp b/test/conformance/adapters/cuda/memory_tests.cpp new file mode 100644 index 0000000000..8ca62c1e63 --- /dev/null +++ b/test/conformance/adapters/cuda/memory_tests.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2022-2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License 
v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "cuda_fixtures.h" + +using cudaMemoryTest = uur::urContextTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(cudaMemoryTest); + +TEST_P(cudaMemoryTest, urMemBufferNoActiveContext) { + constexpr size_t memSize = 1024u; + + CUcontext current = nullptr; + do { + CUcontext oldContext = nullptr; + ASSERT_SUCCESS_CUDA(cuCtxPopCurrent(&oldContext)); + ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(¤t)); + } while (current != nullptr); + + ur_mem_handle_t mem; + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize, + nullptr, &mem)); + ASSERT_NE(mem, nullptr); + + ASSERT_SUCCESS(urMemRelease(mem)); +} From 01f2f92fc876230496b6372f8c0f16e098ab496a Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Tue, 8 Aug 2023 15:00:50 +0100 Subject: [PATCH 2/3] [UR] Include overloads for complete types --- scripts/templates/params.hpp.mako | 13 ++++ source/common/ur_params.hpp | 76 +++++++++++++++++++ test/conformance/adapters/cuda/CMakeLists.txt | 4 +- ...da_context_tests.cpp => context_tests.cpp} | 0 4 files changed, 92 insertions(+), 1 deletion(-) rename test/conformance/adapters/cuda/{cuda_context_tests.cpp => context_tests.cpp} (100%) diff --git a/scripts/templates/params.hpp.mako b/scripts/templates/params.hpp.mako index 78966d66ff..3fef2e0c72 100644 --- a/scripts/templates/params.hpp.mako +++ b/scripts/templates/params.hpp.mako @@ -126,6 +126,19 @@ template inline void serializeTagged(std::ostream &os, const void * %endfor } // namespace ${x}_params +## TODO - should be removed as part of #789 +#ifdef UR_INCLUDE_HANDLE_IMPLEMENTATION_OVERLOADS +%for spec in specs: +%for obj in spec['objects']: +%if re.match(r"handle", obj['type']): +inline std::ostream &operator<<(std::ostream &os, const struct ${th.make_type_name(n, tags, obj)}_ &){ + return os; +} +%endif +%endfor +%endfor +#endif + %for spec in specs: %for obj in spec['objects']: ## ENUM ####################################################################### diff --git a/source/common/ur_params.hpp b/source/common/ur_params.hpp index 2f9789506e..b3f9b88fc5 100644 --- a/source/common/ur_params.hpp +++ b/source/common/ur_params.hpp @@ -193,6 +193,82 @@ inline void serializeTagged(std::ostream &os, const void *ptr, } // namespace ur_params +#ifdef UR_INCLUDE_HANDLE_IMPLEMENTATION_OVERLOADS +inline std::ostream &operator<<(std::ostream &os, + const struct ur_adapter_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_platform_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_device_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_context_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_event_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_program_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_kernel_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_queue_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_native_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_sampler_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_mem_handle_t_ &) { + return 
os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_physical_mem_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_usm_pool_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_exp_image_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_exp_image_mem_handle_t_ &) { + return os; +} +inline std::ostream &operator<<(std::ostream &os, + const struct ur_exp_interop_mem_handle_t_ &) { + return os; +} +inline std::ostream & +operator<<(std::ostream &os, + const struct ur_exp_interop_semaphore_handle_t_ &) { + return os; +} +inline std::ostream & +operator<<(std::ostream &os, const struct ur_exp_command_buffer_handle_t_ &) { + return os; +} +#endif + inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_structure_type_t value); diff --git a/test/conformance/adapters/cuda/CMakeLists.txt b/test/conformance/adapters/cuda/CMakeLists.txt index 6d653c2374..89c558019b 100644 --- a/test/conformance/adapters/cuda/CMakeLists.txt +++ b/test/conformance/adapters/cuda/CMakeLists.txt @@ -5,7 +5,7 @@ add_conformance_test_with_devices_environment(adapter-cuda cuda_fixtures.h - cuda_context_tests.cpp + context_tests.cpp cuda_urContextGetNativeHandle.cpp cuda_urDeviceGetNativeHandle.cpp cuda_urDeviceCreateWithNativeHandle.cpp @@ -16,6 +16,8 @@ add_conformance_test_with_devices_environment(adapter-cuda ) target_link_libraries(test-adapter-cuda PRIVATE cudadrv ur_adapter_cuda) target_include_directories(test-adapter-cuda PRIVATE ${CUDA_DIR} "${CUDA_DIR}/../../../" ) +# TODO - remove as part of #789 +target_compile_definitions(test-adapter-cuda PRIVATE UR_INCLUDE_HANDLE_IMPLEMENTATION_OVERLOADS) set_tests_properties(adapter-cuda PROPERTIES LABELS "conformance:cuda" diff --git a/test/conformance/adapters/cuda/cuda_context_tests.cpp b/test/conformance/adapters/cuda/context_tests.cpp similarity index 100% rename from test/conformance/adapters/cuda/cuda_context_tests.cpp rename to test/conformance/adapters/cuda/context_tests.cpp From 0457bd70d8a6ac001e972b09163a7f621c417ad3 Mon Sep 17 00:00:00 2001 From: Petr Vesely Date: Wed, 9 Aug 2023 11:11:38 +0100 Subject: [PATCH 3/3] [UR] Fix review feedback --- test/conformance/adapters/cuda/context_tests.cpp | 8 -------- test/conformance/adapters/cuda/cuda_fixtures.h | 10 +--------- .../cuda/cuda_urDeviceCreateWithNativeHandle.cpp | 2 +- test/conformance/adapters/cuda/kernel_tests.cpp | 1 + 4 files changed, 3 insertions(+), 18 deletions(-) diff --git a/test/conformance/adapters/cuda/context_tests.cpp b/test/conformance/adapters/cuda/context_tests.cpp index e8f4348e15..0ca826fff5 100644 --- a/test/conformance/adapters/cuda/context_tests.cpp +++ b/test/conformance/adapters/cuda/context_tests.cpp @@ -43,9 +43,6 @@ TEST_P(cudaUrContextCreateTest, CreateWithChildThread) { } TEST_P(cudaUrContextCreateTest, ActiveContext) { - // start with no active context - uur::clearCudaContext(); - ur_context_handle_t context = nullptr; ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context)); ASSERT_NE(context, nullptr); @@ -115,14 +112,9 @@ TEST_P(cudaUrContextCreateTest, ContextLifetimeExisting) { ASSERT_SUCCESS(urQueueRelease(queue)); ASSERT_SUCCESS(urContextRelease(context)); - - // check that the context was cleaned up, the old context will be restored } TEST_P(cudaUrContextCreateTest, ThreadedContext) { - // start with no active cuda 
context - uur::clearCudaContext(); - // create two new UR contexts ur_context_handle_t context1; ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context1)); diff --git a/test/conformance/adapters/cuda/cuda_fixtures.h b/test/conformance/adapters/cuda/cuda_fixtures.h index 062cc83db0..3c81a79dd2 100644 --- a/test/conformance/adapters/cuda/cuda_fixtures.h +++ b/test/conformance/adapters/cuda/cuda_fixtures.h @@ -20,14 +20,6 @@ struct ResultCuda { CUresult value; }; -inline void clearCudaContext() { - CUcontext context = nullptr; - do { - cuCtxSetCurrent(nullptr); - cuCtxGetCurrent(&context); - } while (context != nullptr); -} - } // namespace uur #ifndef ASSERT_EQ_RESULT_CUDA @@ -41,7 +33,7 @@ inline void clearCudaContext() { #ifndef EXPECT_EQ_RESULT_CUDA #define EXPECT_EQ_RESULT_CUDA(EXPECTED, ACTUAL) \ - ASSERT_EQ(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL)) + EXPECT_EQ(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL)) #endif // EXPECT_EQ_RESULT_CUDA #ifndef EXPECT_SUCCESS_CUDA diff --git a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp index 7fcee9a8f2..b116c9a5c9 100644 --- a/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp +++ b/test/conformance/adapters/cuda/cuda_urDeviceCreateWithNativeHandle.cpp @@ -7,7 +7,7 @@ using urCudaDeviceCreateWithNativeHandle = uur::urPlatformTest; -TEST_F(urCudaDeviceCreateWithNativeHandle, DISABLED_Success) { +TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { // get a device from cuda int nCudaDevices; ASSERT_SUCCESS_CUDA(cuDeviceGetCount(&nCudaDevices)); diff --git a/test/conformance/adapters/cuda/kernel_tests.cpp b/test/conformance/adapters/cuda/kernel_tests.cpp index 1e7f54f3cf..53efdf28da 100644 --- a/test/conformance/adapters/cuda/kernel_tests.cpp +++ b/test/conformance/adapters/cuda/kernel_tests.cpp @@ -9,6 +9,7 @@ using cudaKernelTest = uur::urQueueTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(cudaKernelTest); +// The first argument stores the implicit global offset inline constexpr size_t NumberOfImplicitArgsCUDA = 1; const char *ptxSource = "\n\