Skip to content

Commit

Permalink
[CUDA][LIBCLC] Implement RC11 seq_cst for PTX6.0 (#12516)
Browse files Browse the repository at this point in the history
Implement `seq_cst` RC11/PTX 6.0 memory consistency for the CUDA backend.

See https://dl.acm.org/doi/pdf/10.1145/3297858.3304043 and
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#memory-consistency-model
for full details. Requires sm_70 or above. With this PR there is now a
complete mapping between SYCL memory consistency model capabilities and
the official CUDA model, fully exploiting CUDA capabilities when
possible on supported arches.

This makes the SYCL-CTS atomic_ref tests fully pass for sm_70 on the
cuda backend.

Fixes #11208

Depends on #12907

---------

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
  • Loading branch information
JackAKirk authored Mar 18, 2024
1 parent 17ef793 commit c1e2957
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 7 deletions.
7 changes: 7 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/atomic/atomic_add.cl
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ Memory order is stored in the lowest 5 bits */
ADDR_SPACE, ADDR_SPACE_NV) \
} \
break; \
case SequentiallyConsistent: \
if (__clc_nvvm_reflect_arch() >= 700) { \
__CLC_NVVM_FENCE_SC_SM70() \
__CLC_NVVM_ATOMIC_IMPL_ORDER(double, double, d, add, ADDR_SPACE, \
ADDR_SPACE_NV, _acq_rel) \
break; \
} \
} \
__builtin_trap(); \
__builtin_unreachable(); \
Expand Down
8 changes: 8 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/atomic/atomic_cmpxchg.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//

#include <atomic_helpers.h>
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

Expand Down Expand Up @@ -120,6 +121,13 @@ Memory order is stored in the lowest 5 bits */ \
TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, ADDR_SPACE, ADDR_SPACE_NV) \
} \
break; \
case SequentiallyConsistent: \
if (__clc_nvvm_reflect_arch() >= 700) { \
__CLC_NVVM_FENCE_SC_SM70() \
__CLC_NVVM_ATOMIC_CAS_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \
ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \
break; \
} \
} \
__builtin_trap(); \
__builtin_unreachable(); \
Expand Down
16 changes: 16 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/atomic/atomic_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,15 @@ _CLC_OVERLOAD _CLC_DECL void __spirv_MemoryBarrier(unsigned int, unsigned int);
} \
}

// Emits a PTX sequentially-consistent fence (`fence.sc`) whose scope suffix is
// chosen from the `scope` variable that must be visible at the expansion site:
// `.sys` for CrossDevice, `.gpu` for Device, and `.cta` for any other value.
// The macro expands to a complete if/else chain and is written at call sites
// without a trailing semicolon. The SM70 suffix reflects that this fence is
// only meant for sm_70 and newer, so expand it behind an
// `__clc_nvvm_reflect_arch() >= 700` check.
// The "memory" clobber makes each asm statement a compiler-level barrier as
// well: `volatile` alone only keeps the statement from being removed, it does
// not by itself forbid moving memory accesses across it.
#define __CLC_NVVM_FENCE_SC_SM70() \
  if (scope == CrossDevice) { \
    __asm__ __volatile__("fence.sc.sys;" : : : "memory"); \
  } else if (scope == Device) { \
    __asm__ __volatile__("fence.sc.gpu;" : : : "memory"); \
  } else { \
    __asm__ __volatile__("fence.sc.cta;" : : : "memory"); \
  }

#define __CLC_NVVM_ATOMIC_IMPL( \
TYPE, TYPE_MANGLED, TYPE_NV, TYPE_MANGLED_NV, OP, NAME_MANGLED, \
ADDR_SPACE, POINTER_AND_ADDR_SPACE_MANGLED, ADDR_SPACE_NV, SUBSTITUTION) \
Expand Down Expand Up @@ -117,6 +126,13 @@ Memory order is stored in the lowest 5 bits */ \
OP, ADDR_SPACE, ADDR_SPACE_NV) \
} \
break; \
case SequentiallyConsistent: \
if (__clc_nvvm_reflect_arch() >= 700) { \
__CLC_NVVM_FENCE_SC_SM70() \
__CLC_NVVM_ATOMIC_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, OP, \
ADDR_SPACE, ADDR_SPACE_NV, _acq_rel) \
break; \
} \
} \
__builtin_trap(); \
__builtin_unreachable(); \
Expand Down
7 changes: 7 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/atomic/atomic_load.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//

#include <atomic_helpers.h>
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

Expand Down Expand Up @@ -53,6 +54,12 @@ Memory order is stored in the lowest 5 bits */ \
case Acquire: \
__CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
ADDR_SPACE, ADDR_SPACE_NV, _acquire) \
break; \
case SequentiallyConsistent: \
__CLC_NVVM_FENCE_SC_SM70() \
__CLC_NVVM_ATOMIC_LOAD_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
ADDR_SPACE, ADDR_SPACE_NV, _acquire) \
break; \
} \
} else { \
TYPE_NV res = __nvvm_volatile_ld##ADDR_SPACE_NV##TYPE_MANGLED_NV( \
Expand Down
8 changes: 8 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/atomic/atomic_store.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//

#include <atomic_helpers.h>
#include <spirv/spirv.h>
#include <spirv/spirv_types.h>

Expand Down Expand Up @@ -54,6 +55,13 @@ Memory order is stored in the lowest 5 bits */ \
__CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
ADDR_SPACE, ADDR_SPACE_NV, \
_release) \
break; \
case SequentiallyConsistent: \
__CLC_NVVM_FENCE_SC_SM70() \
__CLC_NVVM_ATOMIC_STORE_IMPL_ORDER(TYPE, TYPE_NV, TYPE_MANGLED_NV, \
ADDR_SPACE, ADDR_SPACE_NV, \
_release) \
break; \
} \
} else { \
switch (order) { \
Expand Down
14 changes: 7 additions & 7 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,14 +81,14 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
# commit 6513abc404979fa109d64500bf899e632d511291
# Merge: 09be0881 6d586094
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
# commit 29ee45c4451a682f744146cc9dbeb2617ecdd6b3
# Merge: db4b0c14 4f5d005a
# Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
# Date: Thu Mar 14 22:38:53 2024 +0000
# Merge pull request #1410 from kbenzie/benie/cmake-external-adapter-source-dirs
# [CMake] Support external adapter source dirs
set(UNIFIED_RUNTIME_TAG 6513abc404979fa109d64500bf899e632d511291)
# Date: Mon Mar 18 12:14:26 2024 +0000
# Merge pull request #1291 from JackAKirk/cuda-seq-cst-b
# [CUDA] Report that devices with cc >= sm_70 support seq_cst
set(UNIFIED_RUNTIME_TAG 29ee45c4451a682f744146cc9dbeb2617ecdd6b3)

if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")
Expand Down

0 comments on commit c1e2957

Please sign in to comment.