From 5332773b17efbf10e1b72cd633c1d7e2b4f75125 Mon Sep 17 00:00:00 2001
From: Nick Sarnie
Date: Fri, 12 Apr 2024 01:51:19 +0900
Subject: [PATCH] [SYCL][ESIMD] atomic_update with data size less than 4 bytes
 should use LSC atomics (#13340)

SVM doesn't support less than 4 bytes on Gen12; we either get an error or the
wrong answer.

---------

Signed-off-by: Sarnie, Nick
---
 sycl/include/sycl/ext/intel/esimd/memory.hpp |  32 ++---
 sycl/test/esimd/memory_properties.cpp        | 126 ++++++++++++++++++-
 2 files changed, 142 insertions(+), 16 deletions(-)

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 7ef701b7edc85..188bce9d59b62 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -5862,8 +5862,8 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
 template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
-  // supported only by LSC.
+  // 2 byte, 8 byte types, non-power of two, and operations wider than
+  // 32 are supported only by LSC.
   if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     return slm_atomic_update_impl<Op, T, N,
@@ ... @@ template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
                   simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
-  // supported only by LSC.
+  // 2 byte, 8 byte types, non-power of two, and operations wider than
+  // 32 are supported only by LSC.
   if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     // half and short are supported in LSC.
@@ -6031,8 +6031,8 @@ template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
                   simd<T, N> src1, simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
-  // supported only by LSC.
+  // 2 byte, 8 byte types, non-power of two, and operations wider than
+  // 32 are supported only by LSC.
   if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
@@ -6417,7 +6417,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd_mask<N> mask,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
         p, byte_offset, mask);
@@ -6640,7 +6640,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
                 (Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
                 (Op == atomic_op::fadd) || (Op == atomic_op::fsub) ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
         p, byte_offset, src0, mask);
@@ -6888,9 +6888,11 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 
   // Use LSC atomic when cache hints are present, FP atomics is used,
-  // non-power of two length is used, or operation width greater than 32.
+  // non-power of two length is used, the operation width is greater than 32,
+  // or the data size is less than 4 bytes.
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
+                sizeof(T) < 4) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
     // expected value first, then new value. But atomic_update uses reverse
     // order, hence the src1/src0 swap.
@@ -7116,7 +7118,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                !detail::isPowerOf2(N, 32)) {
+                !detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
         acc, byte_offset, mask);
@@ -7384,7 +7386,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
                 Op == atomic_op::fmin || Op == atomic_op::fmax ||
                 Op == atomic_op::fadd || Op == atomic_op::fsub ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
         acc, byte_offset, src0, mask);
@@ -7681,9 +7683,11 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
   static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
   // Use LSC atomic when cache hints are present, FP atomics is used,
-  // non-power of two length is used, or operation width greater than 32.
+  // non-power of two length is used, the operation width is greater than 32,
+  // or the data size is less than 4 bytes.
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
+                sizeof(T) < 4) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
     // expected value first, then new value. But atomic_update uses reverse
     // order, hence the src1/src0 swap.
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 2c69d3a69d782..b23697d91922a 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -322,6 +322,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     atomic_update(ptr, offsets, pred);
   }
 
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef)
+  {
+    int16_t *ptr = 0;
+    constexpr int VL = 8;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto atomic_res =
+        atomic_update<atomic_op::inc, int16_t, VL>(ptr, offsets);
+  }
+
   // Accessor
   // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
@@ -377,6 +388,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   auto atomic_res_acc =
       atomic_update(acc, offsets, pred);
   }
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK-STATEFUL: call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 {{[^)]+}}, <8 x i32> undef)
+  // CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef)
+  {
+    using AccType =
+        sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
+    AccType *acc = nullptr;
+    constexpr int VL = 8;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto atomic_res =
+        atomic_update<atomic_op::inc, int16_t, VL>(*acc, offsets);
+  }
 }
 
 // Test atomic update with one operand.
@@ -432,6 +456,18 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   auto res_atomic_8 =
       atomic_update(ptr, offsets, add, pred);
 
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32>{{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+  {
+    int16_t *ptr = 0;
+    constexpr int VL = 4;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto add = simd<int16_t, VL>(5);
+    auto atomic_res =
+        atomic_update<atomic_op::add, int16_t, VL>(ptr, offsets, add);
+  }
+
   // Accessors
   // CHECK-STATEFUL-COUNT-14: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
@@ -483,6 +519,21 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.sub.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
   auto res_atomic_17 =
       atomic_update(acc, offsets, add, pred);
+
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
+  // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+  {
+    using AccType =
+        sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
+    AccType *acc = nullptr;
+    constexpr int VL = 4;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto add = simd<int16_t, VL>(5);
+    auto atomic_res =
+        atomic_update<atomic_op::add, int16_t, VL>(*acc, offsets, add);
+  }
 }
 
 // Test atomic update with two operands.
@@ -626,6 +677,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   auto res_atomic_100 = atomic_update(
       ptr, offsets, swap, compare, pred);
 
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+  {
+    int16_t *ptr = 0;
+    constexpr int VL = 4;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    simd<int16_t, VL> swap = simd<int16_t, VL>(1) * sizeof(int);
+    auto compare = swap * 2;
+    auto atomic_res = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
+        ptr, offsets, swap, compare);
+  }
+
   // Accessors
   // CHECK-STATEFUL-COUNT-30: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
@@ -751,6 +815,22 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.cmpxchg.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
   auto res_atomic_33 = atomic_update(
       acc, offsets, swap, compare, pred);
+
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
+  // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+  {
+    using AccType =
+        sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
+    AccType *acc = nullptr;
+    constexpr int VL = 4;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    simd<int16_t, VL> swap = simd<int16_t, VL>(1) * sizeof(int);
+    auto compare = swap * 2;
+    auto atomic_res = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
+        *acc, offsets, swap, compare);
+  }
 }
 
 // Test slm_atomic_update without operands.
@@ -824,12 +904,11 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   {
     constexpr int VL = 16;
     simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
-    auto pred = simd_mask<VL>(1);
     simd<int16_t, VL> add = simd<int16_t, VL>(1) * sizeof(int);
 
     // CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> undef, i32 0, <16 x i32> undef)
     auto res_slm_atomic_0 =
-        slm_atomic_update<atomic_op::add, int16_t, VL>(offsets, add, pred);
+        slm_atomic_update<atomic_op::add, int16_t, VL>(offsets, add);
   }
   // Expect DWORD for fmin.
   {
@@ -934,6 +1013,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       offsets_view.select(), swap_view.select(),
       compare_view.select());
 
+  // Expect LSC for short.
+  {
+    constexpr int VL = 16;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto compare = simd<int16_t, VL>(VL, 1);
+    auto swap = compare * 2;
+
+    // CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, i32 0, <16 x i32> undef)
+    auto res_slm_atomic_0 =
+        slm_atomic_update<atomic_op::cmpxchg, int16_t, VL>(offsets, swap,
+                                                           compare);
+  }
+
   // Expect LSC for int64_t.
   {
     constexpr int VL = 16;
@@ -964,6 +1056,15 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       local_acc, offsets_view.select(), pred);
   auto res_slm_atomic_6 = atomic_update(
       local_acc, offsets_view.select());
+
+  // Expect LSC for short.
+  {
+    using LocalAccType = sycl::local_accessor<int16_t, 1>;
+    LocalAccType *local_acc = nullptr;
+    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
+    auto res_slm_atomic_1 =
+        atomic_update<atomic_op::inc, int16_t, VL>(*local_acc, offsets);
+  }
 }
 // One operand atomic.
 {
@@ -997,6 +1098,16 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       pred);
   res_slm_atomic_8 = atomic_update(
       local_acc, offsets_view.select(), add_view.select());
+
+  // Expect LSC for short.
+  {
+    using LocalAccType = sycl::local_accessor<int16_t, 1>;
+    LocalAccType *local_acc = nullptr;
+    simd<int16_t, VL> add = simd<int16_t, VL>(1) * sizeof(int);
+    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    auto res_slm_atomic_1 =
+        atomic_update<atomic_op::add, int16_t, VL>(*local_acc, offsets, add);
+  }
 }
 // Two operand atomic.
 {
@@ -1069,6 +1180,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   res_slm_atomic_16 = atomic_update(
       local_acc, offsets_view.select(), swap_view.select(),
       compare_view.select());
+
+  // Expect LSC for short.
+  {
+    using LocalAccType = sycl::local_accessor<int16_t, 1>;
+    LocalAccType *local_acc = nullptr;
+    auto compare = simd<int16_t, VL>(VL, 1);
+    auto swap = compare * 2;
+    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    auto res_slm_atomic_1 = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
+        *local_acc, offsets, swap, compare);
+  }
 }
 }
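
Note (not part of the patch): the user-visible pattern the fix addresses is a plain atomic_update on a pointer whose element type is narrower than a DWORD. A minimal sketch follows, assuming a Gen12-class device and the usual ESIMD setup; the queue q, the buffer data, and the vector length VL are made up for illustration.

#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

namespace esimd = sycl::ext::intel::esimd;

int main() {
  sycl::queue q;
  constexpr int VL = 8;
  // 2-byte element type, i.e. sizeof(T) < 4: with this patch the calls below
  // lower to LSC atomics; the old Gen12 SVM path errored or gave wrong
  // results.
  uint16_t *data = sycl::malloc_shared<uint16_t>(VL, q);
  for (int i = 0; i < VL; ++i)
    data[i] = 0;

  q.parallel_for(sycl::nd_range<1>(1, 1),
                 [=](sycl::nd_item<1>) SYCL_ESIMD_KERNEL {
                   // Per-lane byte offsets: 0, 2, 4, ...
                   esimd::simd<uint32_t, VL> offsets(0, sizeof(uint16_t));
                   // No-operand form.
                   esimd::atomic_update<esimd::atomic_op::inc, uint16_t, VL>(
                       data, offsets);
                   // Two-operand form. Note the (swap, compare) operand
                   // order, i.e. new value first - the reverse of the
                   // underlying LSC intrinsic, as the comments in memory.hpp
                   // explain.
                   esimd::simd<uint16_t, VL> swap(2);
                   esimd::simd<uint16_t, VL> compare(1);
                   esimd::atomic_update<esimd::atomic_op::cmpxchg, uint16_t,
                                        VL>(data, offsets, swap, compare);
                 })
      .wait();
  sycl::free(data, q);
  return 0;
}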
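
Taken together, the touched overloads now implement one dispatch rule. A compact restatement (illustrative only; it folds in the new sizeof(T) < 4 trigger but omits the FP-op and fcmpxchg triggers that some overloads also check):

// Not the actual implementation. isPowerOf2(N, 32) in the patch means
// "N is a power of two and does not exceed 32".
template <typename T, int N, bool HasCacheHints>
constexpr bool uses_lsc_atomics() {
  constexpr bool NIsPow2LE32 = N > 0 && (N & (N - 1)) == 0 && N <= 32;
  return HasCacheHints     // cache hints exist only on the LSC path
         || !NIsPow2LE32   // widths only LSC can handle
         || sizeof(T) < 4; // new: sub-DWORD data, unsupported by Gen12 SVM
}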