diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl index 8ab2604cde0c4..c1d69efc5b477 100644 --- a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl @@ -71,6 +71,13 @@ Memory order is stored in the lowest 5 bits */ ADDR_SPACE, ADDR_SPACE_NV) \ } \ break; \ + case SequentiallyConsistent: \ + if (__clc_nvvm_reflect_arch() >= 700) { \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \ + ADDR_SPACE_NV, _acq_rel) \ + break; \ + } \ } \ __builtin_trap(); \ __builtin_unreachable(); \ diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl index fceeda0f60361..19d4dca833fef 100644 --- a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include @@ -120,6 +121,13 @@ Memory order is stored in the lowest 5 bits */ \ TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, ADDR_SPACE, ADDR_SPACE_NV) \ } \ break; \ + case SequentiallyConsistent: \ + if (__clc_nvvm_reflect_arch() >= 700) { \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_CAS_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \ + ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \ + break; \ + } \ } \ __builtin_trap(); \ __builtin_unreachable(); \ diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h index 56c20cece7935..ecffd9e82d2fe 100644 --- a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h @@ -72,6 +72,15 @@ _CLC_OVERLOAD _CLC_DECL void __spirv_MemoryBarrier(unsigned int, unsigned int); } \ } +#define __CLC_NVVM_FENCE_SC_SM70() \ + if (scope == CrossDevice) { \ + __asm__ __volatile__("fence.sc.sys;"); \ + } else if (scope == Device) { \ + __asm__ __volatile__("fence.sc.gpu;"); \ + } else { \ + __asm__ __volatile__("fence.sc.cta;"); \ + } + #define __CLC_NVVM_ATOMIC_IMPL( \ TYPE, TYPE_MANGLED, TYPE_NV, TYPE_MANGLED_NV, OP, NAME_MANGLED, \ ADDR_SPACE, POINTER_AND_ADDR_SPACE_MANGLED, ADDR_SPACE_NV, SUBSTITUTION) \ @@ -117,6 +126,13 @@ Memory order is stored in the lowest 5 bits */ \ OP, ADDR_SPACE, ADDR_SPACE_NV) \ } \ break; \ + case SequentiallyConsistent: \ + if (__clc_nvvm_reflect_arch() >= 700) { \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \ + ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \ + break; \ + } \ } \ __builtin_trap(); \ __builtin_unreachable(); \ diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl index 54483b8c5ec25..60311a978762d 100644 --- a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include @@ -53,6 +54,12 @@ Memory order is stored in the lowest 5 bits */ \ case Acquire: \ __CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \ ADDR_SPACE, ADDR_SPACE_NV, _acquire) \ + break; \ + case SequentiallyConsistent: \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \ + ADDR_SPACE, ADDR_SPACE_NV, _acquire) \ + break; \ } \ } else { \ TYPE_NV res = __nvvm_volatile_ld##ADDR_SPACE_NV##TYPE_MANGLED_NV( \ diff --git a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl index 1aaf1c8ab8499..b2e23cd76eac2 100644 --- a/libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl +++ b/libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include @@ -54,6 +55,13 @@ Memory order is stored in the lowest 5 bits */ \ __CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \ ADDR_SPACE, ADDR_SPACE_NV, \ _release) \ + break; \ + case SequentiallyConsistent: \ + __CLC_NVVM_FENCE_SC_SM70() \ + __CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \ + ADDR_SPACE, ADDR_SPACE_NV, \ + _release) \ + break; \ } \ } else { \ switch (order) { \ diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 8f5d715a32925..2fc1e75749364 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -405,8 +405,10 @@ The HIP backend offers a Graph managemenet API very similar to CUDA Graph feature for batching series of operations. The SYCL Graph HIP backend implementation is therefore very similar to that of CUDA. +The minimum version of ROCm required to support `sycl_ext_oneapi_graph` is 5.5.1. + UR commands (e.g. kernels) are mapped as graph nodes using the -[HIP Management API](https://docs.amd.com/projects/HIP/en/docs-5.5.0/doxygen/html/group___graph.html). +[HIP Management API](https://rocm.docs.amd.com/projects/HIP/en/docs-5.5.1/doxygen/html/group___graph.html). Synchronization between commands (UR sync-points) is implemented using graph dependencies. Executable HIP Graphs can be submitted to a HIP stream diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index c6b962b8b0f48..46a200e001231 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -31,6 +31,7 @@ _PI_API(piDeviceRetain) _PI_API(piDeviceRelease) _PI_API(piextDeviceSelectBinary) _PI_API(piextGetDeviceFunctionPointer) +_PI_API(piextGetGlobalVariablePointer) _PI_API(piextDeviceGetNativeHandle) _PI_API(piextDeviceCreateWithNativeHandle) // Context diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 56fdeb7a1051b..f6ee364c17a23 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -154,9 +154,10 @@ // 15.44 Add coarse-grain memory advice flag for HIP. // 15.45 Added piextKernelSuggestMaxCooperativeGroupCount and // piextEnqueueCooperativeKernelLaunch. +// 15.46 Add piextGetGlobalVariablePointer #define _PI_H_VERSION_MAJOR 15 -#define _PI_H_VERSION_MINOR 45 +#define _PI_H_VERSION_MINOR 46 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -1287,6 +1288,10 @@ __SYCL_EXPORT pi_result piextGetDeviceFunctionPointer( pi_device device, pi_program program, const char *function_name, pi_uint64 *function_pointer_ret); +__SYCL_EXPORT pi_result piextGetGlobalVariablePointer( + pi_device Device, pi_program Program, const char *GlobalVariableName, + size_t *GlobalVariableSize, void **GlobalVariablePointerRet); + // // Context // diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 02fe3af901cb8..e6d395e758568 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -848,6 +848,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index b895727c9d0fa..2fbde10b77123 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -851,6 +851,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c923c802f1d3f..26a1b104b3335 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -875,6 +875,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index 2aef845590663..01b6dee1bb0f2 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -852,6 +852,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 45fb66575ec42..e8a168b60445e 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -813,6 +813,15 @@ pi_result piextGetDeviceFunctionPointer(pi_device Device, pi_program Program, FunctionPointerRet); } +pi_result piextGetGlobalVariablePointer(pi_device Device, pi_program Program, + const char *GlobalVariableName, + size_t *GlobalVariableSize, + void **GlobalVariablePointerRet) { + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context, pi_device Device, pi_usm_mem_properties *Properties, size_t Size, diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 2a437a05f6676..9d1a27965e54d 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit ec634ff05b067d7922ec45059dda94665e5dcd9b - # Merge: 418ad535 8714b853 - # Author: Piotr Balcer - # Date: Thu Mar 14 15:52:52 2024 +0100 - # Merge pull request #1438 from PatKamin/disable-fuzztests - # Disable fuzz tests on ubuntu-22.04 runner - set(UNIFIED_RUNTIME_TAG ec634ff05b067d7922ec45059dda94665e5dcd9b) + # commit c98fdbcf1f43ce132fbae75336bda984e4ce2e78 + # Merge: 5f4dd113 9b3cf9d3 + # Author: Kenneth Benzie (Benie) + # Date: Thu Mar 21 10:51:45 2024 +0000 + # Merge pull request #1439 from nrspruit/fix_device_native_proxy_buffer + # [L0] Fix Native Host memory usage on device with copy back sync + set(UNIFIED_RUNTIME_TAG c98fdbcf1f43ce132fbae75336bda984e4ce2e78) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 87ee60f41e2da..f1eb777046a90 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -2162,6 +2162,22 @@ inline pi_result piextGetDeviceFunctionPointer(pi_device Device, return PI_SUCCESS; } +inline pi_result piextGetGlobalVariablePointer( + pi_device Device, pi_program Program, const char *GlobalVariableName, + size_t *GlobalVariableSize, void **GlobalVariablePointerRet) { + PI_ASSERT(Program, PI_ERROR_INVALID_PROGRAM); + + auto UrDevice = reinterpret_cast(Device); + + ur_program_handle_t UrProgram = + reinterpret_cast(Program); + + HANDLE_ERRORS(urProgramGetGlobalVariablePointer( + UrDevice, UrProgram, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet)); + return PI_SUCCESS; +} + // Special version of piKernelSetArg to accept pi_mem. inline pi_result piextKernelSetArgMemObj(pi_kernel Kernel, pi_uint32 ArgIndex, diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index b7741ce6cf509..8701d23027682 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -763,6 +763,15 @@ __SYCL_EXPORT pi_result piextGetDeviceFunctionPointer( FunctionPointerRet); } +__SYCL_EXPORT pi_result piextGetGlobalVariablePointer( + pi_device Device, pi_program Program, const char *GlobalVariableName, + size_t *GlobalVariableSize, void **GlobalVariablePointerRet) { + + return pi2ur::piextGetGlobalVariablePointer( + Device, Program, GlobalVariableName, GlobalVariableSize, + GlobalVariablePointerRet); +} + /// Hint to migrate memory to the device /// /// @param Queue is the queue to submit to @@ -1428,6 +1437,7 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piProgramCompile) _PI_API(piProgramGetBuildInfo) _PI_API(piextGetDeviceFunctionPointer) + _PI_API(piextGetGlobalVariablePointer) _PI_API(piMemBufferCreate) _PI_API(piMemGetInfo) diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index 13b555bec8880..ec83769469dcd 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -115,6 +115,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index 4c091716caedb..975e3315c0197 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -115,6 +115,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index 7a90e461a30f6..336e1cd3cdd8e 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -114,6 +114,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump index 1929f3871cfe0..1294e7ae831cf 100644 --- a/sycl/test/abi/pi_nativecpu_symbol_check.dump +++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump @@ -115,6 +115,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index 159e427835651..fa7c7a2dc0525 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -114,6 +114,7 @@ piextEnqueueWriteHostPipe piextEventCreateWithNativeHandle piextEventGetNativeHandle piextGetDeviceFunctionPointer +piextGetGlobalVariablePointer piextImportExternalSemaphoreOpaqueFD piextKernelCreateWithNativeHandle piextKernelGetNativeHandle diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index 5ab408d2eed01..be777f23df239 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -280,6 +280,12 @@ mock_piextGetDeviceFunctionPointer(pi_device device, pi_program program, return PI_SUCCESS; } +inline pi_result mock_piextGetGlobalVariablePointer( + pi_device device, pi_program program, const char *global_variable_name, + size_t *global_variable_size, void **global_variable_size_ret) { + return PI_SUCCESS; +} + // // Context //