diff --git a/scripts/templates/params.hpp.mako b/scripts/templates/params.hpp.mako
index 78966d66ff..3fef2e0c72 100644
--- a/scripts/templates/params.hpp.mako
+++ b/scripts/templates/params.hpp.mako
@@ -126,6 +126,19 @@ template inline void serializeTagged(std::ostream &os, const void *
 %endfor
 } // namespace ${x}_params
 
+## TODO - should be removed as part of #789. Until then: for every spec object whose type matches "handle", emit a no-op operator<< (returns the stream unchanged) for its implementation struct, compiled only when UR_INCLUDE_HANDLE_IMPLEMENTATION_OVERLOADS is defined.
+#ifdef UR_INCLUDE_HANDLE_IMPLEMENTATION_OVERLOADS
+%for spec in specs:
+%for obj in spec['objects']:
+%if re.match(r"handle", obj['type']):
+inline std::ostream &operator<<(std::ostream &os, const struct ${th.make_type_name(n, tags, obj)}_ &){
+    return os;
+}
+%endif
+%endfor
+%endfor
+#endif
+
 %for spec in specs:
 %for obj in spec['objects']:
 ## ENUM #######################################################################
diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt
index f85d759c09..1222b17f67 100644
--- a/source/adapters/cuda/CMakeLists.txt
+++ b/source/adapters/cuda/CMakeLists.txt
@@ -3,7 +3,7 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-set(CUDA_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/cuda")
+set(CUDA_DIR "${SYCL_ADAPTER_DIR}/sycl/plugins/unified_runtime/ur/adapters/cuda" CACHE PATH "CUDA adapter directory")
 
 set(TARGET_NAME ur_adapter_cuda)
 
diff --git a/source/common/ur_params.hpp b/source/common/ur_params.hpp
index 2f9789506e..b3f9b88fc5 100644
--- a/source/common/ur_params.hpp
+++ b/source/common/ur_params.hpp
@@ -193,6 +193,82 @@ inline void serializeTagged(std::ostream &os, const void *ptr,
 
 } // namespace ur_params
 
+#ifdef UR_INCLUDE_HANDLE_IMPLEMENTATION_OVERLOADS
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_adapter_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_platform_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_device_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_context_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_event_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_program_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_kernel_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_queue_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_native_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_sampler_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_mem_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_physical_mem_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_usm_pool_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_exp_image_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_exp_image_mem_handle_t_ &) {
+    return os;
+}
+inline std::ostream &operator<<(std::ostream &os,
+                                const struct ur_exp_interop_mem_handle_t_ &) {
+    return os;
+}
+inline std::ostream &
+operator<<(std::ostream &os,
+           const struct ur_exp_interop_semaphore_handle_t_ &) {
+    return os;
+}
+inline std::ostream &
+operator<<(std::ostream &os, const struct ur_exp_command_buffer_handle_t_ &) {
+    return os;
+}
+#endif
+
 inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value);
 inline std::ostream &operator<<(std::ostream &os,
                                 enum ur_structure_type_t value);
diff --git a/test/conformance/adapters/cuda/CMakeLists.txt b/test/conformance/adapters/cuda/CMakeLists.txt
index 241eb87a8c..89c558019b 100644
--- a/test/conformance/adapters/cuda/CMakeLists.txt
+++ b/test/conformance/adapters/cuda/CMakeLists.txt
@@ -5,13 +5,19 @@
 
 add_conformance_test_with_devices_environment(adapter-cuda
     cuda_fixtures.h
+    context_tests.cpp
     cuda_urContextGetNativeHandle.cpp
     cuda_urDeviceGetNativeHandle.cpp
     cuda_urDeviceCreateWithNativeHandle.cpp
     cuda_urEventGetNativeHandle.cpp
     cuda_urEventCreateWithNativeHandle.cpp
+    kernel_tests.cpp
+    memory_tests.cpp
 )
-target_link_libraries(test-adapter-cuda PRIVATE cudadrv)
+target_link_libraries(test-adapter-cuda PRIVATE cudadrv ur_adapter_cuda)
+target_include_directories(test-adapter-cuda PRIVATE ${CUDA_DIR} "${CUDA_DIR}/../../../" )
+# TODO - remove as part of #789. Defines the macro that guards the handle operator<< overloads in source/common/ur_params.hpp.
+target_compile_definitions(test-adapter-cuda PRIVATE UR_INCLUDE_HANDLE_IMPLEMENTATION_OVERLOADS)
 
 set_tests_properties(adapter-cuda PROPERTIES
     LABELS "conformance:cuda"
diff --git a/test/conformance/adapters/cuda/context_tests.cpp b/test/conformance/adapters/cuda/context_tests.cpp
new file mode 100644
index 0000000000..0ca826fff5
--- /dev/null
+++ b/test/conformance/adapters/cuda/context_tests.cpp
@@ -0,0 +1,204 @@
+// Copyright (C) 2022-2023 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "context.hpp"
+#include "cuda_fixtures.h"
+#include "queue.hpp"
+#include <thread>
+
+using cudaUrContextCreateTest = uur::urDeviceTest;
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(cudaUrContextCreateTest);
+
+constexpr unsigned int known_cuda_api_version = 3020;
+
+TEST_P(cudaUrContextCreateTest, CreateWithChildThread) {
+
+    ur_context_handle_t context = nullptr;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context));
+    ASSERT_NE(context, nullptr);
+
+    // Retrieve the CUDA context to check information is correct
+    auto checkValue = [=] {
+        CUcontext cudaContext = context->get();
+        unsigned int version = 0;
+        EXPECT_SUCCESS_CUDA(cuCtxGetApiVersion(cudaContext, &version));
+        EXPECT_EQ(version, known_cuda_api_version);
+
+        // The UR context must not already be current on this child thread
+        CUcontext current;
+        ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+        EXPECT_NE(cudaContext, current);
+
+        // Push the context and check that it became the current one
+        EXPECT_SUCCESS_CUDA(cuCtxPushCurrent(cudaContext));
+        EXPECT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+        EXPECT_EQ(cudaContext, current);
+    };
+
+    auto callContextFromOtherThread = std::thread(checkValue);
+    callContextFromOtherThread.join();
+    ASSERT_SUCCESS(urContextRelease(context));
+}
+
+TEST_P(cudaUrContextCreateTest, ActiveContext) {
+    ur_context_handle_t context = nullptr;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context));
+    ASSERT_NE(context, nullptr);
+
+    ur_queue_handle_t queue = nullptr;
+    ur_queue_properties_t queue_props{UR_STRUCTURE_TYPE_QUEUE_PROPERTIES,
+                                      nullptr, 0};
+    ASSERT_SUCCESS(urQueueCreate(context, device, &queue_props, &queue));
+    ASSERT_NE(queue, nullptr);
+
+    // check that the queue has the correct context
+    ASSERT_EQ(context, queue->getContext());
+
+    // create a buffer
+    ur_mem_handle_t buffer = nullptr;
+    ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, 1024,
+                                     nullptr, &buffer));
+    ASSERT_NE(buffer, nullptr);
+
+    // check that the context is now the active CUDA context
+    CUcontext cudaCtx = nullptr;
+    ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&cudaCtx));
+    ASSERT_NE(cudaCtx, nullptr);
+
+    ur_native_handle_t native_context = nullptr;
+    ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context));
+    ASSERT_NE(native_context, nullptr);
+    ASSERT_EQ(cudaCtx, reinterpret_cast<CUcontext>(native_context));
+
+    // release resources
+    ASSERT_SUCCESS(urMemRelease(buffer));
+    ASSERT_SUCCESS(urQueueRelease(queue));
+    ASSERT_SUCCESS(urContextRelease(context));
+}
+
+TEST_P(cudaUrContextCreateTest, ContextLifetimeExisting) {
+    // start by setting up a CUDA context on the thread
+    CUcontext original;
+    ASSERT_SUCCESS_CUDA(cuCtxCreate(&original, CU_CTX_MAP_HOST, device->get()));
+
+    // ensure the CUDA context is active
+    CUcontext current = nullptr;
+    ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+    ASSERT_EQ(original, current);
+
+    // create a UR context
+    ur_context_handle_t context;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context));
+    ASSERT_NE(context, nullptr);
+
+    // create a queue with the context
+    ur_queue_handle_t queue;
+    ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, &queue));
+    ASSERT_NE(queue, nullptr);
+
+    // ensure the queue has the correct context
+    ASSERT_EQ(context, queue->getContext());
+
+    // create a buffer in the context to set the context as active
+    ur_mem_handle_t buffer;
+    ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, 1024,
+                                     nullptr, &buffer));
+
+    // check that context is now the active cuda context
+    ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+    ASSERT_EQ(current, context->get());
+    ASSERT_SUCCESS(urMemRelease(buffer));
+    ASSERT_SUCCESS(urQueueRelease(queue));
+    ASSERT_SUCCESS(urContextRelease(context));
+}
+
+TEST_P(cudaUrContextCreateTest, ThreadedContext) {
+    // create two new UR contexts
+    ur_context_handle_t context1;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context1));
+    ASSERT_NE(context1, nullptr);
+
+    ur_context_handle_t context2;
+    ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context2));
+    ASSERT_NE(context2, nullptr);
+
+    // setup synchronization variables between the main thread and
+    // the testing thread
+    std::mutex m;
+    std::condition_variable cv;
+    bool released = false;
+    bool thread_done = false;
+
+    // create a testing thread that will create a queue with the first context,
+    // release the queue, then wait for the main thread to release
+    // the first context, and then create and release another queue with
+    // the second context.
+    auto test_thread = std::thread([&] {
+        CUcontext current = nullptr;
+
+        // create a queue with the first context
+        ur_queue_handle_t queue;
+        ASSERT_SUCCESS(urQueueCreate(context1, device, nullptr, &queue));
+        ASSERT_NE(queue, nullptr);
+
+        // ensure that the queue has the correct context
+        ASSERT_EQ(context1, queue->getContext());
+
+        // create a buffer to set context1 as the active context
+        ur_mem_handle_t buffer;
+        ASSERT_SUCCESS(urMemBufferCreate(context1, UR_MEM_FLAG_READ_WRITE, 1024,
+                                         nullptr, &buffer));
+        ASSERT_NE(buffer, nullptr);
+
+        // release the mem and queue
+        ASSERT_SUCCESS(urMemRelease(buffer));
+        ASSERT_SUCCESS(urQueueRelease(queue));
+
+        // mark the first set of processing as done and notify the main thread
+        std::unique_lock lock(m);
+        thread_done = true;
+        lock.unlock();
+        cv.notify_one();
+
+        // wait for the main thread to release the first context
+        lock.lock();
+        cv.wait(lock, [&] { return released; });
+
+        // create a queue with the 2nd context
+        ASSERT_SUCCESS(urQueueCreate(context2, device, nullptr, &queue));
+        ASSERT_NE(queue, nullptr);
+
+        // ensure queue has correct context
+        ASSERT_EQ(context2, queue->getContext());
+
+        // create a buffer to set the active context
+        ASSERT_SUCCESS(urMemBufferCreate(context2, UR_MEM_FLAG_READ_WRITE, 1024,
+                                         nullptr, &buffer));
+
+        // check that the 2nd context is now the active cuda context
+        ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+        ASSERT_EQ(current, context2->get());
+
+        // release
+        ASSERT_SUCCESS(urMemRelease(buffer));
+        ASSERT_SUCCESS(urQueueRelease(queue));
+    });
+
+    // wait for the thread to be done with the first queue to release the first
+    // context
+    std::unique_lock lock(m);
+    cv.wait(lock, [&] { return thread_done; });
+    ASSERT_SUCCESS(urContextRelease(context1));
+
+    // notify the other thread that the context was released
+    released = true;
+    lock.unlock();
+    cv.notify_one();
+
+    // wait for the thread to finish
+    test_thread.join();
+
+    ASSERT_SUCCESS(urContextRelease(context2));
+}
diff --git a/test/conformance/adapters/cuda/cuda_fixtures.h b/test/conformance/adapters/cuda/cuda_fixtures.h
index e367a4aa2c..3c81a79dd2 100644
--- a/test/conformance/adapters/cuda/cuda_fixtures.h
+++ b/test/conformance/adapters/cuda/cuda_fixtures.h
@@ -19,6 +19,7 @@ struct ResultCuda {
 
     CUresult value;
 };
+
 } // namespace uur
 
 #ifndef ASSERT_EQ_RESULT_CUDA
@@ -32,12 +33,11 @@ struct ResultCuda {
 
 #ifndef EXPECT_EQ_RESULT_CUDA
 #define EXPECT_EQ_RESULT_CUDA(EXPECTED, ACTUAL)                                \
-    EXPECT_EQ_RESULT_CUDA(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL))
+    EXPECT_EQ(uur::ResultCuda(EXPECTED), uur::ResultCuda(ACTUAL))
 #endif // EXPECT_EQ_RESULT_CUDA
 
 #ifndef EXPECT_SUCCESS_CUDA
-#define EXPECT_SUCCESS_CUDA(ACTUAL)                                            \
-    EXPECT_EQ_RESULT_CUDA(UR_RESULT_SUCCESS, ACTUAL)
+#define EXPECT_SUCCESS_CUDA(ACTUAL) EXPECT_EQ_RESULT_CUDA(CUDA_SUCCESS, ACTUAL)
-#endif // EXPECT_EQ_RESULT_CUDA
+#endif // EXPECT_SUCCESS_CUDA
 
 #endif // UR_TEST_CONFORMANCE_ADAPTERS_CUDA_FIXTURES_H_INCLUDED
diff --git a/test/conformance/adapters/cuda/kernel_tests.cpp b/test/conformance/adapters/cuda/kernel_tests.cpp
new file mode 100644
index 0000000000..53efdf28da
--- /dev/null
+++ b/test/conformance/adapters/cuda/kernel_tests.cpp
@@ -0,0 +1,271 @@
+// Copyright (C) 2022-2023 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "kernel.hpp"
+#include <cstring>
+
+using cudaKernelTest = uur::urQueueTest;
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(cudaKernelTest);
+
+// The first argument stores the implicit global offset
+inline constexpr size_t NumberOfImplicitArgsCUDA = 1;
+
+const char *ptxSource = "\n\
+.version 3.2\n\
+.target sm_20\n\
+.address_size 64\n\
+.visible .entry _Z8myKernelPi(\n\
+ .param .u64 _Z8myKernelPi_param_0\n\
+)\n\
+{\n\
+ .reg .s32 %r<5>;\n\
+ .reg .s64 %rd<5>;\n\
+ ld.param.u64 %rd1, [_Z8myKernelPi_param_0];\n\
+ cvta.to.global.u64 %rd2, %rd1;\n\
+ .loc 1 3 1\n\
+ mov.u32 %r1, %ntid.x;\n\
+ mov.u32 %r2, %ctaid.x;\n\
+ mov.u32 %r3, %tid.x;\n\
+ mad.lo.s32 %r4, %r1, %r2, %r3;\n\
+ mul.wide.s32 %rd3, %r4, 4;\n\
+ add.s64 %rd4, %rd2, %rd3;\n\
+ .loc 1 4 1\n\
+ st.global.u32 [%rd4], %r4;\n\
+ .loc 1 5 2\n\
+ ret;\n\
+ ret;\
+\n\
+}\
+\n\
+";
+
+const char *twoParams = "\n\
+.version 3.2\n\
+.target sm_20\n\
+.address_size 64\n\
+.visible .entry twoParamKernel(\n\
+ .param .u64 twoParamKernel_param_0,\n\
+ .param .u64 twoParamKernel_param_1\n\
+)\n\
+{\n\
+ ret;\
+ \n\
+}\n\
+";
+
+const char *threeParamsTwoLocal = "\n\
+.version 3.2\n\
+.target sm_20\n\
+.address_size 64\n\
+.visible .entry twoParamKernelLocal(\n\
+ .param .u64 twoParamKernel_param_0,\n\
+ .param .u32 twoParamKernel_param_1,\n\
+ .param .u32 twoParamKernel_param_2\n\
+)\n\
+{\n\
+ ret;\
+ \n\
+}\n\
+";
+
+TEST_P(cudaKernelTest, CreateProgramAndKernel) {
+    // the binary size is the PTX text length; sizeof on the pointer is wrong
+    ur_program_handle_t program = nullptr;
+    ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, strlen(ptxSource),
+                                             (const uint8_t *)ptxSource,
+                                             nullptr, &program));
+    ASSERT_NE(program, nullptr);
+    ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+
+    ur_kernel_handle_t kernel = nullptr;
+    ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel));
+    ASSERT_NE(kernel, nullptr);
+
+    ASSERT_SUCCESS(urKernelRelease(kernel));
+    ASSERT_SUCCESS(urProgramRelease(program));
+}
+
+TEST_P(cudaKernelTest, CreateProgramAndKernelWithMetadata) {
+    // metadata payload: two leading 32-bit words, then the x, y, z sizes
+    std::vector<uint32_t> reqdWorkGroupSizeMD;
+    reqdWorkGroupSizeMD.reserve(5);
+    // 64-bit representing bit size
+    reqdWorkGroupSizeMD.push_back(96);
+    reqdWorkGroupSizeMD.push_back(0);
+    // reqd_work_group_size x
+    reqdWorkGroupSizeMD.push_back(8);
+    // reqd_work_group_size y
+    reqdWorkGroupSizeMD.push_back(16);
+    // reqd_work_group_size z
+    reqdWorkGroupSizeMD.push_back(32);
+
+    const char *reqdWorkGroupSizeMDConstName =
+        "_Z8myKernelPi@reqd_work_group_size";
+    std::vector<char> reqdWorkGroupSizeMDName(
+        reqdWorkGroupSizeMDConstName, reqdWorkGroupSizeMDConstName +
+                                          strlen(reqdWorkGroupSizeMDConstName) +
+                                          1);
+
+    ur_program_metadata_value_t reqd_work_group_value;
+    reqd_work_group_value.pData = reqdWorkGroupSizeMD.data();
+
+    ur_program_metadata_t reqdWorkGroupSizeMDProp = {
+        reqdWorkGroupSizeMDName.data(), UR_PROGRAM_METADATA_TYPE_BYTE_ARRAY,
+        reqdWorkGroupSizeMD.size() * sizeof(uint32_t), reqd_work_group_value};
+
+    ur_program_properties_t programProps{UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES,
+                                         nullptr, 1, &reqdWorkGroupSizeMDProp};
+    ur_program_handle_t program = nullptr;
+    ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, strlen(ptxSource),
+                                             (const uint8_t *)ptxSource,
+                                             &programProps, &program));
+    ASSERT_NE(program, nullptr);
+
+    ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+
+    ur_kernel_handle_t kernel = nullptr;
+    ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel));
+
+    size_t compileWGSize[3] = {0};
+    ASSERT_SUCCESS(urKernelGetGroupInfo(
+        kernel, device, UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
+        sizeof(compileWGSize), &compileWGSize, nullptr));
+
+    for (int i = 0; i < 3; i++) {
+        ASSERT_EQ(compileWGSize[i], reqdWorkGroupSizeMD[i + 2]);
+    }
+
+    ASSERT_SUCCESS(urKernelRelease(kernel));
+    ASSERT_SUCCESS(urProgramRelease(program));
+}
+
+TEST_P(cudaKernelTest, URKernelArgumentSimple) {
+    ur_program_handle_t program = nullptr;
+    ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, strlen(ptxSource),
+                                             (const uint8_t *)ptxSource,
+                                             nullptr, &program));
+    ASSERT_NE(program, nullptr);
+    ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+
+    ur_kernel_handle_t kernel = nullptr;
+    ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel));
+    ASSERT_NE(kernel, nullptr);
+
+    int number = 10;
+    ASSERT_SUCCESS(
+        urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number));
+    const auto &kernelArgs = kernel->getArgIndices();
+    ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA);
+
+    int storedValue = *static_cast<const int *>(kernelArgs[0]);
+    ASSERT_EQ(storedValue, number);
+
+    ASSERT_SUCCESS(urKernelRelease(kernel));
+    ASSERT_SUCCESS(urProgramRelease(program));
+}
+
+TEST_P(cudaKernelTest, URKernelArgumentSetTwice) {
+    ur_program_handle_t program = nullptr;
+    ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, strlen(ptxSource),
+                                             (const uint8_t *)ptxSource,
+                                             nullptr, &program));
+    ASSERT_NE(program, nullptr);
+    ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+
+    ur_kernel_handle_t kernel = nullptr;
+    ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel));
+    ASSERT_NE(kernel, nullptr);
+
+    int number = 10;
+    ASSERT_SUCCESS(
+        urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &number));
+    const auto &kernelArgs = kernel->getArgIndices();
+    ASSERT_EQ(kernelArgs.size(), 1 + NumberOfImplicitArgsCUDA);
+    int storedValue = *static_cast<const int *>(kernelArgs[0]);
+    ASSERT_EQ(storedValue, number);
+
+    int otherNumber = 934;
+    ASSERT_SUCCESS(
+        urKernelSetArgValue(kernel, 0, sizeof(int), nullptr, &otherNumber));
+    const auto kernelArgs2 = kernel->getArgIndices();
+    ASSERT_EQ(kernelArgs2.size(), 1 + NumberOfImplicitArgsCUDA);
+    storedValue = *static_cast<const int *>(kernelArgs2[0]);
+    ASSERT_EQ(storedValue, otherNumber);
+
+    ASSERT_SUCCESS(urKernelRelease(kernel));
+    ASSERT_SUCCESS(urProgramRelease(program));
+}
+
+TEST_P(cudaKernelTest, URKernelDispatch) {
+    ur_program_handle_t program = nullptr;
+    ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, strlen(ptxSource),
+                                             (const uint8_t *)ptxSource,
+                                             nullptr, &program));
+    ASSERT_NE(program, nullptr);
+    ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+
+    ur_kernel_handle_t kernel = nullptr;
+    ASSERT_SUCCESS(urKernelCreate(program, "_Z8myKernelPi", &kernel));
+    ASSERT_NE(kernel, nullptr);
+
+    const size_t memSize = 1024u;
+    ur_mem_handle_t buffer = nullptr;
+    ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize,
+                                     nullptr, &buffer));
+    ASSERT_NE(buffer, nullptr);
+    ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 0, nullptr, buffer));
+
+    const size_t workDim = 1;
+    const size_t globalWorkOffset[] = {0};
+    const size_t globalWorkSize[] = {1};
+    const size_t localWorkSize[] = {1};
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, workDim,
+                                         globalWorkOffset, globalWorkSize,
+                                         localWorkSize, 0, nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    ASSERT_SUCCESS(urMemRelease(buffer));
+    ASSERT_SUCCESS(urKernelRelease(kernel));
+    ASSERT_SUCCESS(urProgramRelease(program));
+}
+
+TEST_P(cudaKernelTest, URKernelDispatchTwo) {
+    ur_program_handle_t program = nullptr;
+    ASSERT_SUCCESS(urProgramCreateWithBinary(context, device, strlen(twoParams),
+                                             (const uint8_t *)twoParams,
+                                             nullptr, &program));
+    ASSERT_NE(program, nullptr);
+    ASSERT_SUCCESS(urProgramBuild(context, program, nullptr));
+
+    ur_kernel_handle_t kernel = nullptr;
+    ASSERT_SUCCESS(urKernelCreate(program, "twoParamKernel", &kernel));
+    ASSERT_NE(kernel, nullptr);
+
+    const size_t memSize = 1024u;
+    ur_mem_handle_t buffer1 = nullptr;
+    ur_mem_handle_t buffer2 = nullptr;
+    ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize,
+                                     nullptr, &buffer1));
+    ASSERT_NE(buffer1, nullptr);
+    ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize,
+                                     nullptr, &buffer2));
+    ASSERT_NE(buffer2, nullptr);
+    ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 0, nullptr, buffer1));
+    ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 1, nullptr, buffer2));
+
+    const size_t workDim = 1;
+    const size_t globalWorkOffset[] = {0};
+    const size_t globalWorkSize[] = {1};
+    const size_t localWorkSize[] = {1};
+    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, workDim,
+                                         globalWorkOffset, globalWorkSize,
+                                         localWorkSize, 0, nullptr, nullptr));
+    ASSERT_SUCCESS(urQueueFinish(queue));
+
+    ASSERT_SUCCESS(urMemRelease(buffer1));
+    ASSERT_SUCCESS(urMemRelease(buffer2));
+    ASSERT_SUCCESS(urKernelRelease(kernel));
+    ASSERT_SUCCESS(urProgramRelease(program));
+}
diff --git a/test/conformance/adapters/cuda/memory_tests.cpp b/test/conformance/adapters/cuda/memory_tests.cpp
new file mode 100644
index 0000000000..8ca62c1e63
--- /dev/null
+++ b/test/conformance/adapters/cuda/memory_tests.cpp
@@ -0,0 +1,27 @@
+// Copyright (C) 2022-2023 Intel Corporation
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+// See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "cuda_fixtures.h"
+
+using cudaMemoryTest = uur::urContextTest;
+UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(cudaMemoryTest);
+
+TEST_P(cudaMemoryTest, urMemBufferNoActiveContext) {
+    constexpr size_t memSize = 1024u;
+    // pop CUDA contexts off this thread until none is current
+    CUcontext current = nullptr;
+    do {
+        CUcontext oldContext = nullptr;
+        ASSERT_SUCCESS_CUDA(cuCtxPopCurrent(&oldContext));
+        ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
+    } while (current != nullptr);
+
+    ur_mem_handle_t mem;
+    ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, memSize,
+                                     nullptr, &mem));
+    ASSERT_NE(mem, nullptr);
+
+    ASSERT_SUCCESS(urMemRelease(mem));
+}