From 22e97851e401a1427f9d4a313cc954d4ea75baff Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Mon, 18 Mar 2024 00:23:22 -0700 Subject: [PATCH 1/7] =?UTF-8?q?[UR][L0]=20Support=20for=20urUsmP2PPeerAcce?= =?UTF-8?q?ssGetInfoExp=20to=20query=20p2p=20access=E2=80=A6=20(#12983)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … info pre-commit PR for https://github.com/oneapi-src/unified-runtime/pull/1429 --------- Signed-off-by: Neil R. Spruit Co-authored-by: Kenneth Benzie (Benie) --- sycl/plugins/unified_runtime/CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 2a437a05f6676..6b02bd454e7b9 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit ec634ff05b067d7922ec45059dda94665e5dcd9b - # Merge: 418ad535 8714b853 - # Author: Piotr Balcer - # Date: Thu Mar 14 15:52:52 2024 +0100 - # Merge pull request #1438 from PatKamin/disable-fuzztests - # Disable fuzz tests on ubuntu-22.04 runner - set(UNIFIED_RUNTIME_TAG ec634ff05b067d7922ec45059dda94665e5dcd9b) + # commit 09be0881b727fadb1c04b38c00d2562d7dc6875f + # Merge: bb589ca8 e9f855d4 + # Author: Kenneth Benzie (Benie) + # Date: Thu Mar 14 22:10:28 2024 +0000 + # Merge pull request #1429 from nrspruit/l0_p2p_device_query + # [L0] Support for urUsmP2PPeerAccessGetInfoExp to query p2p access info + set(UNIFIED_RUNTIME_TAG 09be0881b727fadb1c04b38c00d2562d7dc6875f) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") From fa53feae984e03a0c42fe269b04554141b02bf7b Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Mon, 18 Mar 2024 17:14:53 +0000 Subject: [PATCH 2/7] 
[CUDA][LIBCLC] Implement RC11 seq_cst for PTX6.0 (#12516) Implement `seq_cst` RC11/ptx6.0 memory consistency for CUDA backend. See https://dl.acm.org/doi/pdf/10.1145/3297858.3304043 and https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#memory-consistency-model for full details. Requires sm_70 or above. With this PR there is now a complete mapping between SYCL memory consistency model capabilities and the official CUDA model, fully exploiting CUDA capabilities when possible on supported arches. This makes the SYCL-CTS atomic_ref tests fully pass for sm_70 on the cuda backend. Fixes https://github.com/intel/llvm/issues/11208 Depends on https://github.com/intel/llvm/pull/12907 --------- Signed-off-by: JackAKirk --- .../ptx-nvidiacl/libspirv/atomic/atomic_add.cl | 7 +++++++ .../libspirv/atomic/atomic_cmpxchg.cl | 8 ++++++++ .../libspirv/atomic/atomic_helpers.h | 16 ++++++++++++++++ .../ptx-nvidiacl/libspirv/atomic/atomic_load.cl | 7 +++++++ .../ptx-nvidiacl/libspirv/atomic/atomic_store.cl | 8 ++++++++ sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------ 6 files changed, 52 insertions(+), 6 deletions(-) diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl index 8ab2604cde0c4..c1d69efc5b477 100644 --- a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl @@ -71,6 +71,13 @@ Memory order is stored in the lowest 5 bits */ ADDR_SPACE, ADDR_SPACE_NV) \ } \ break; \ + case SequentiallyConsistent: \ + if (__clc_nvvm_reflect_arch() >= 700) { \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \ + ADDR_SPACE_NV, _acq_rel) \ + break; \ + } \ } \ __builtin_trap(); \ __builtin_unreachable(); \ diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl index fceeda0f60361..19d4dca833fef 100644 --- 
a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include @@ -120,6 +121,13 @@ Memory order is stored in the lowest 5 bits */ \ TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, ADDR_SPACE, ADDR_SPACE_NV) \ } \ break; \ + case SequentiallyConsistent: \ + if (__clc_nvvm_reflect_arch() >= 700) { \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_CAS_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \ + ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \ + break; \ + } \ } \ __builtin_trap(); \ __builtin_unreachable(); \ diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h index 56c20cece7935..ecffd9e82d2fe 100644 --- a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h @@ -72,6 +72,15 @@ _CLC_OVERLOAD _CLC_DECL void __spirv_MemoryBarrier(unsigned int, unsigned int); } \ } +#define __CLC_NVVM_FENCE_SC_SM70() \ + if (scope == CrossDevice) { \ + __asm__ __volatile__("fence.sc.sys;"); \ + } else if (scope == Device) { \ + __asm__ __volatile__("fence.sc.gpu;"); \ + } else { \ + __asm__ __volatile__("fence.sc.cta;"); \ + } + #define __CLC_NVVM_ATOMIC_IMPL( \ TYPE, TYPE_MANGLED, TYPE_NV, TYPE_MANGLED_NV, OP, NAME_MANGLED, \ ADDR_SPACE, POINTER_AND_ADDR_SPACE_MANGLED, ADDR_SPACE_NV, SUBSTITUTION) \ @@ -117,6 +126,13 @@ Memory order is stored in the lowest 5 bits */ \ OP, ADDR_SPACE, ADDR_SPACE_NV) \ } \ break; \ + case SequentiallyConsistent: \ + if (__clc_nvvm_reflect_arch() >= 700) { \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \ + ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \ + break; \ + } \ } \ __builtin_trap(); \ __builtin_unreachable(); \ diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl 
b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl index 54483b8c5ec25..60311a978762d 100644 --- a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include @@ -53,6 +54,12 @@ Memory order is stored in the lowest 5 bits */ \ case Acquire: \ __CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \ ADDR_SPACE, ADDR_SPACE_NV, _acquire) \ + break; \ + case SequentiallyConsistent: \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \ + ADDR_SPACE, ADDR_SPACE_NV, _acquire) \ + break; \ } \ } else { \ TYPE_NV res = __nvvm_volatile_ld##ADDR_SPACE_NV##TYPE_MANGLED_NV( \ diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl index 1aaf1c8ab8499..b2e23cd76eac2 100644 --- a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include @@ -54,6 +55,13 @@ Memory order is stored in the lowest 5 bits */ \ __CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \ ADDR_SPACE, ADDR_SPACE_NV, \ _release) \ + break; \ + case SequentiallyConsistent: \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \ + ADDR_SPACE, ADDR_SPACE_NV, \ + _release) \ + break; \ } \ } else { \ switch (order) { \ diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 6b02bd454e7b9..319763ca97b99 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO 
"https://github.com/oneapi-src/unified-runtime.git") - # commit 09be0881b727fadb1c04b38c00d2562d7dc6875f - # Merge: bb589ca8 e9f855d4 + # commit 29ee45c4451a682f744146cc9dbeb2617ecdd6b3 + # Merge: db4b0c14 4f5d005a # Author: Kenneth Benzie (Benie) - # Date: Thu Mar 14 22:10:28 2024 +0000 - # Merge pull request #1429 from nrspruit/l0_p2p_device_query - # [L0] Support for urUsmP2PPeerAccessGetInfoExp to query p2p access info - set(UNIFIED_RUNTIME_TAG 09be0881b727fadb1c04b38c00d2562d7dc6875f) + # Date: Mon Mar 18 12:14:26 2024 +0000 + # Merge pull request #1291 from JackAKirk/cuda-seq-cst-b + # [CUDA] Report that devices with cc >= sm_70 support seq_cst + set(UNIFIED_RUNTIME_TAG 29ee45c4451a682f744146cc9dbeb2617ecdd6b3) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") From 0326cdc5a42286db076ac56fed1ef703e9769d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio?= Date: Tue, 19 Mar 2024 16:43:01 +0000 Subject: [PATCH 3/7] [UR] Add urProgramGetGlobalVariablePointer entrypoint (#12496) Co-authored-by: Kenneth Benzie (Benie) --- sycl/include/sycl/detail/pi.def | 1 + sycl/include/sycl/detail/pi.h | 7 ++++++- sycl/plugins/cuda/pi_cuda.cpp | 9 +++++++++ sycl/plugins/hip/pi_hip.cpp | 9 +++++++++ sycl/plugins/level_zero/pi_level_zero.cpp | 9 +++++++++ sycl/plugins/native_cpu/pi_native_cpu.cpp | 9 +++++++++ sycl/plugins/opencl/pi_opencl.cpp | 9 +++++++++ sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------ sycl/plugins/unified_runtime/pi2ur.hpp | 16 ++++++++++++++++ .../unified_runtime/pi_unified_runtime.cpp | 10 ++++++++++ sycl/test/abi/pi_cuda_symbol_check.dump | 1 + sycl/test/abi/pi_hip_symbol_check.dump | 1 + sycl/test/abi/pi_level_zero_symbol_check.dump | 1 + sycl/test/abi/pi_nativecpu_symbol_check.dump | 1 + sycl/test/abi/pi_opencl_symbol_check.dump | 1 + sycl/unittests/helpers/PiMockPlugin.hpp | 6 ++++++ 16 files changed, 95 insertions(+), 7 deletions(-) diff --git 
a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index c6b962b8b0f48..46a200e001231 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -31,6 +31,7 @@ _PI_API(piDeviceRetain) _PI_API(piDeviceRelease) _PI_API(piextDeviceSelectBinary) _PI_API(piextGetDeviceFunctionPointer) +_PI_API(piextGetGlobalVariablePointer) _PI_API(piextDeviceGetNativeHandle) _PI_API(piextDeviceCreateWithNativeHandle) // Context diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 56fdeb7a1051b..f6ee364c17a23 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -154,9 +154,10 @@ // 15.44 Add coarse-grain memory advice flag for HIP. // 15.45 Added piextKernelSuggestMaxCooperativeGroupCount and // piextEnqueueCooperativeKernelLaunch. +// 15.46 Add piextGetGlobalVariablePointer #define _PI_H_VERSION_MAJOR 15 -#define _PI_H_VERSION_MINOR 45 +#define _PI_H_VERSION_MINOR 46 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -1287,6 +1288,10 @@ __SYCL_EXPORT pi_result piextGetDeviceFunctionPointer( pi_device device, pi_program program, const char *function_name, pi_uint64 *function_pointer_ret); +__SYCL_EXPORT pi_result piextGetGlobalVariablePointer( + pi_device Device, pi_program Program, const char *GlobalVariableName, + size_t *GlobalVariableSize, void **GlobalVariablePointerRet); + // // Context // diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 02fe3af901cb8..e6d395e758568 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -848,6 +848,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, 
GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index b895727c9d0fa..2fbde10b77123 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -851,6 +851,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c923c802f1d3f..26a1b104b3335 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -875,6 +875,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index 2aef845590663..01b6dee1bb0f2 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ 
b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -852,6 +852,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 45fb66575ec42..e8a168b60445e 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -813,6 +813,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 319763ca97b99..b96906e4f67ad 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 29ee45c4451a682f744146cc9dbeb2617ecdd6b3 - # Merge: db4b0c14 4f5d005a + # commit 4d0183a8e3152f7c5b7a814d5001c90cb2412051 + # Merge: 29ee45c4 ca3da5aa # Author: Kenneth Benzie (Benie) - # 
Date: Mon Mar 18 12:14:26 2024 +0000 - # Merge pull request #1291 from JackAKirk/cuda-seq-cst-b - # [CUDA] Report that devices with cc >= sm_70 support seq_cst - set(UNIFIED_RUNTIME_TAG 29ee45c4451a682f744146cc9dbeb2617ecdd6b3) + # Date: Mon Mar 18 23:56:24 2024 +0000 + # Merge pull request #1255 from fabiomestre/fabio/add_global_variable_pointer + # [SPEC] Add urProgramGetGlobalVariablePointer entrypoint + set(UNIFIED_RUNTIME_TAG 4d0183a8e3152f7c5b7a814d5001c90cb2412051) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 87ee60f41e2da..f1eb777046a90 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2162,6 +2162,22 @@ inline pi_result piextGetDeviceFunctionPointer(pi_device Device, return PI_SUCCESS; } +inline pi_result piextGetGlobalVariablePointer( + pi_device Device, pi_program Program, const char *GlobalVariableName, + size_t *GlobalVariableSize, void **GlobalVariablePointerRet) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + auto UrDevice = reinterpret_cast<ur_device_handle_t>(Device); + + ur_program_handle_t UrProgram = + reinterpret_cast<ur_program_handle_t>(Program); + + HANDLE_ERRORS(urProgramGetGlobalVariablePointer( + UrDevice, UrProgram, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet)); + return PI_SUCCESS; +} + // Special version of piKernelSetArg to accept pi_mem. 
inline pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index b7741ce6cf509..8701d23027682 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -763,6 +763,15 @@ __SYCL_EXPORT pi_result piextGetDeviceFunctionPointer( FunctionPointerRet); } +__SYCL_EXPORT pi_result piextGetGlobalVariablePointer( + pi_device Device, pi_program Program, const char *GlobalVariableName, + size_t *GlobalVariableSize, void **GlobalVariablePointerRet) { + + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + /// Hint to migrate memory to the device /// /// @param Queue is the queue to submit to @@ -1428,6 +1437,7 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piProgramCompile) _PI_API(piProgramGetBuildInfo) _PI_API(piextGetDeviceFunctionPointer) + _PI_API(piextGetGlobalVariablePointer) _PI_API(piMemBufferCreate) _PI_API(piMemGetInfo) diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index 13b555bec8880..ec83769469dcd 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -115,6 +115,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index 4c091716caedb..975e3315c0197 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -115,6 +115,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle 
piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 7a90e461a30f6..336e1cd3cdd8e 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -114,6 +114,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump index 1929f3871cfe0..1294e7ae831cf 100644 --- a/sycl/test/abi/pi_nativecpu_symbol_check.dump +++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump @@ -115,6 +115,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 159e427835651..fa7c7a2dc0525 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -114,6 +114,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 5ab408d2eed01..be777f23df239 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -280,6 +280,12 @@ mock_piextGetDeviceFunctionPointer(pi_device device, 
pi_program program, return PI_SUCCESS; } +inline pi_result mock_piextGetGlobalVariablePointer( + pi_device device, pi_program program, const char *global_variable_name, + size_t *global_variable_size, void **global_variable_size_ret) { + return PI_SUCCESS; +} + // // Context // From a486c128c90a7b6b97d6c6cc8a8e731470e216bc Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Tue, 19 Mar 2024 14:47:58 -0700 Subject: [PATCH 4/7] [SYCL][Graph][UR] Update UR to support updating kernel commands in command buffers for L0 (#12897) --- sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index b96906e4f67ad..8aab95a8f8769 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 4d0183a8e3152f7c5b7a814d5001c90cb2412051 - # Merge: 29ee45c4 ca3da5aa + # commit ca5c34213c3484072f32a9ab2b56b0f49ebe712e + # Merge: 4d0183a8 c9be1e28 # Author: Kenneth Benzie (Benie) - # Date: Mon Mar 18 23:56:24 2024 +0000 - # Merge pull request #1255 from fabiomestre/fabio/add_global_variable_pointer - # [SPEC] Add urProgramGetGlobalVariablePointer entrypoint - set(UNIFIED_RUNTIME_TAG 4d0183a8e3152f7c5b7a814d5001c90cb2412051) + # Date: Tue Mar 19 16:58:27 2024 +0000 + # Merge pull request #1353 from againull/againull/l0_adapter_update_cmd_buffer + # [L0] Support updating kernel commands in command buffers + set(UNIFIED_RUNTIME_TAG ca5c34213c3484072f32a9ab2b56b0f49ebe712e) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") From 0838aba5066451b46a46fb08b5d33cc022dc185a Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 20 Mar 2024 07:04:37 +0000 Subject: [PATCH 5/7] 
[UR] CI for UR PR refactor-guess-local-worksize (#12663) https://github.com/oneapi-src/unified-runtime/pull/1326 --------- Co-authored-by: Kenneth Benzie (Benie) --- sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 8aab95a8f8769..b27b080b828e5 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit ca5c34213c3484072f32a9ab2b56b0f49ebe712e - # Merge: 4d0183a8 c9be1e28 + # commit ed1f8bf618c88eaabea6bde0f6c06fc265f3b49f + # Merge: ca5c3421 69c43b45 # Author: Kenneth Benzie (Benie) - # Date: Tue Mar 19 16:58:27 2024 +0000 - # Merge pull request #1353 from againull/againull/l0_adapter_update_cmd_buffer - # [L0] Support updating kernel commands in command buffers - set(UNIFIED_RUNTIME_TAG ca5c34213c3484072f32a9ab2b56b0f49ebe712e) + # Date: Tue Mar 19 21:00:20 2024 +0000 + # Merge pull request #1326 from hdelan/refactor-guess-local-worksize + # [CUDA][HIP] Fix bug in guess local worksize funcs and improve local worksize guessing in HIP adapter + set(UNIFIED_RUNTIME_TAG ed1f8bf618c88eaabea6bde0f6c06fc265f3b49f) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") From 257ac92a140349972bb2422d707e784552b67455 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 21 Mar 2024 10:28:46 +0000 Subject: [PATCH 6/7] [SYCL][Graph][HIP] Set minimum ROCm version for graphs (#13035) Tests UR PR https://github.com/oneapi-src/unified-runtime/pull/1447 that only reports support for UR command-buffers on ROCm 5.5.1 and later to work around HIP driver bugs related to HIP-Graph in earlier version. 
This requirement is also explicitly mentioned in the design doc. --- sycl/doc/design/CommandGraph.md | 4 +++- sycl/plugins/unified_runtime/CMakeLists.txt | 14 +++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 8f5d715a32925..2fc1e75749364 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -405,8 +405,10 @@ The HIP backend offers a Graph managemenet API very similar to CUDA Graph feature for batching series of operations. The SYCL Graph HIP backend implementation is therefore very similar to that of CUDA. +The minimum version of ROCm required to support `sycl_ext_oneapi_graph` is 5.5.1. + UR commands (e.g. kernels) are mapped as graph nodes using the -[HIP Management API](https://docs.amd.com/projects/HIP/en/docs-5.5.0/doxygen/html/group___graph.html). +[HIP Management API](https://rocm.docs.amd.com/projects/HIP/en/docs-5.5.1/doxygen/html/group___graph.html). Synchronization between commands (UR sync-points) is implemented using graph dependencies. 
Executable HIP Graphs can be submitted to a HIP stream diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index b27b080b828e5..21f26b960aca8 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit ed1f8bf618c88eaabea6bde0f6c06fc265f3b49f - # Merge: ca5c3421 69c43b45 - # Author: Kenneth Benzie (Benie) - # Date: Tue Mar 19 21:00:20 2024 +0000 - # Merge pull request #1326 from hdelan/refactor-guess-local-worksize - # [CUDA][HIP] Fix bug in guess local worksize funcs and improve local worksize guessing in HIP adapter - set(UNIFIED_RUNTIME_TAG ed1f8bf618c88eaabea6bde0f6c06fc265f3b49f) + # commit 5f4dd113824e90522d813420932c14072dc3049d + # Merge: ed1f8bf b551c77 + # Author: Ewan Crawford + # Date: Fri Mar 15 10:22:39 2024 +0000 + # Merge pull request #1447 from Bensuo/ewan/rocm_5_5_1 + # [HIP][CMDBUF] Require ROCm 5.5.1 for HIP command-buffers + set(UNIFIED_RUNTIME_TAG 5f4dd113824e90522d813420932c14072dc3049d) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") From 42919a98770cfec6b0df6e53c0b54244faac67bb Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Thu, 21 Mar 2024 06:50:57 -0700 Subject: [PATCH 7/7] [UR][L0] Fix Native Host memory usage on device with copy back sync (#13014) pre-commit PR for https://github.com/oneapi-src/unified-runtime/pull/1439 --------- Signed-off-by: Neil R. 
Spruit Co-authored-by: Kenneth Benzie (Benie) --- sycl/plugins/unified_runtime/CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 21f26b960aca8..9d1a27965e54d 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 5f4dd113824e90522d813420932c14072dc3049d - # Merge: ed1f8bf b551c77 - # Author: Ewan Crawford - # Date: Fri Mar 15 10:22:39 2024 +0000 - # Merge pull request #1447 from Bensuo/ewan/rocm_5_5_1 - # [HIP][CMDBUF] Require ROCm 5.5.1 for HIP command-buffers - set(UNIFIED_RUNTIME_TAG 5f4dd113824e90522d813420932c14072dc3049d) + # commit c98fdbcf1f43ce132fbae75336bda984e4ce2e78 + # Merge: 5f4dd113 9b3cf9d3 + # Author: Kenneth Benzie (Benie) + # Date: Thu Mar 21 10:51:45 2024 +0000 + # Merge pull request #1439 from nrspruit/fix_device_native_proxy_buffer + # [L0] Fix Native Host memory usage on device with copy back sync + set(UNIFIED_RUNTIME_TAG c98fdbcf1f43ce132fbae75336bda984e4ce2e78) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")