Skip to content

Commit

Permalink
[ESIMD] Enable FADD/FSUB for slm_atomic_update (#13535)
Browse files Browse the repository at this point in the history
Those test cases required newer GPU driver and thus were disabled
previously.

GPU driver on DG2 still does not handle correctly atomic_update for 'float' and 'half' types.
GPU driver on PVC still does not handle correctly slm_atomic_update for 'half' types.

---------
Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
  • Loading branch information
v-klochkov authored Apr 29, 2024
1 parent 004efa3 commit 3756fd1
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 84 deletions.
20 changes: 10 additions & 10 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5823,7 +5823,7 @@ slm_atomic_update_impl(simd<uint32_t, N> offsets, simd<T, N> src0,
constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
if constexpr (std::is_same_v<T, double>) {
if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
return __esimd_lsc_xatomic_slm_1<T, IOp, cache_hint::none, cache_hint::none,
AddressScale, ImmOffset, EDS, VS,
Transposed, N>(pred.data(), offsets.data(),
Expand Down Expand Up @@ -5867,7 +5867,7 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
if constexpr (std::is_same_v<T, double>) {
if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
return __esimd_lsc_xatomic_slm_2<T, IOp, cache_hint::none, cache_hint::none,
AddressScale, ImmOffset, EDS, VS,
Transposed, N>(pred.data(), offsets.data(),
Expand Down Expand Up @@ -6007,11 +6007,11 @@ template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
// half and short are supported in LSC.
// Non-LSC atomic_update supports only 4-byte int vector operations with
// 1,2,4,8,16,32 vector length. Non-LSC supports only 'store' for FP types.
if constexpr (Op == atomic_op::fmin || Op == atomic_op::fmax ||
Op == atomic_op::fadd || Op == atomic_op::fsub ||
sizeof(T) != 4 || !__ESIMD_DNS::isPowerOf2(N, 32)) {
return slm_atomic_update_impl<Op, T, N,
detail::lsc_data_size::default_size>(
byte_offset, src0, mask);
Expand Down Expand Up @@ -6096,9 +6096,9 @@ template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
simd<T, N> src1, simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
// Non-LSC atomic_update supports only 4-byte int vector operations with
// 1,2,4,8,16,32 vector length.
if constexpr (sizeof(T) != 4 || Op == atomic_op::fcmpxchg ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
// expected value first, then new value. But atomic_update uses reverse
Expand Down
43 changes: 14 additions & 29 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -670,16 +670,13 @@ template <int N, template <class, int> class Op, bool UseMask,
bool UseLSCFeatures, bool UseAcc>
bool test_fp_types(queue q, const Config &cfg) {
bool passed = true;
if constexpr (UseLSCFeatures) {
if constexpr (std::is_same_v<Op<sycl::half, N>, ImplFmin<sycl::half, N>> ||
std::is_same_v<Op<sycl::half, N>, ImplFmax<sycl::half, N>> ||
std::is_same_v<Op<sycl::half, N>,
ImplFcmpwr<sycl::half, N>>) {
auto dev = q.get_device();
if (dev.has(sycl::aspect::fp16)) {
passed &= run_test<UseAcc, sycl::half, N, Op, UseMask, UseLSCFeatures>(
q, cfg);
}
// TODO: Enable FADD/FSUB on DG2/PVC when the error in GPU driver is resolved.
if constexpr (UseLSCFeatures &&
!std::is_same_v<Op<sycl::half, N>, ImplFadd<sycl::half, N>> &&
!std::is_same_v<Op<sycl::half, N>, ImplFsub<sycl::half, N>>) {
if (q.get_device().has(sycl::aspect::fp16)) {
passed &=
run_test<UseAcc, sycl::half, N, Op, UseMask, UseLSCFeatures>(q, cfg);
}
}
passed &= run_test<UseAcc, float, N, Op, UseMask, UseLSCFeatures>(q, cfg);
Expand All @@ -688,7 +685,6 @@ bool test_fp_types(queue q, const Config &cfg) {
q.get_device().has(sycl::aspect::fp64)) {
passed &= run_test<UseAcc, double, N, Op, UseMask, UseLSCFeatures>(q, cfg);
}

#endif // CMPXCHG_TEST
return passed;
}
Expand All @@ -703,7 +699,6 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) {
test_int_types<2, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
passed &=
test_int_types<4, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);

passed &=
test_int_types<8, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
passed &=
Expand All @@ -715,13 +710,10 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) {
if constexpr (UseLSCFeatures) {
passed &= test_int_types<64, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
q, cfg);
// non power of two values are supported only in newer driver.
// TODO: Enable this when the new driver reaches test infrastructure
// (v27556).
#if 0
passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
#endif
passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
q, cfg);
passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
q, cfg);
}

return passed;
Expand All @@ -734,21 +726,14 @@ bool test_fp_types_and_sizes(queue q, const Config &cfg) {
passed &= test_fp_types<1, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
passed &= test_fp_types<2, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
passed &= test_fp_types<4, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);

passed &= test_fp_types<8, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
// Supported by LSC atomic:
passed &= test_fp_types<16, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);

if constexpr (UseLSCFeatures) {
passed &= test_fp_types<16, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
passed &= test_fp_types<64, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);

// non power of two values are supported only in newer driver.
// TODO: Enable this when the new driver reaches test infrastructure
// (v27556).
#if 0
passed &= test_fp_types<12, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
passed &= test_fp_types<35, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
#endif
}
return passed;
}
Expand Down
70 changes: 27 additions & 43 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -593,33 +593,24 @@ template <int N, template <class, int> class Op, bool UseMask,
TestFeatures Features, bool UseAcc>
bool test_fp_types(queue q) {
bool passed = true;
if constexpr (Features == TestFeatures::DG2 ||
Features == TestFeatures::PVC) {
if constexpr (std::is_same_v<Op<sycl::half, N>,
ImplLSCFmax<sycl::half, N>> ||
std::is_same_v<Op<sycl::half, N>,
ImplLSCFmin<sycl::half, N>> ||
std::is_same_v<Op<sycl::half, N>,
ImplLSCFcmpwr<sycl::half, N>>) {
auto dev = q.get_device();
if (dev.has(sycl::aspect::fp16)) {
passed &= run_test<UseAcc, sycl::half, N, Op, UseMask>(q);
}

// TODO: Enable 'half' FADD/FSUB on DG2 when the error in GPU driver is fixed.
if constexpr (Features == TestFeatures::PVC ||
(Features == TestFeatures::DG2 &&
!std::is_same_v<Op<sycl::half, N>, ImplFadd<sycl::half, N>> &&
!std::is_same_v<Op<sycl::half, N>, ImplFsub<sycl::half, N>>)) {
if (q.get_device().has(sycl::aspect::fp16)) {
passed &= run_test<UseAcc, sycl::half, N, Op, UseMask>(q);
}
}

passed &= run_test<UseAcc, float, N, Op, UseMask>(q);

if constexpr (Features == TestFeatures::DG2 ||
Features == TestFeatures::PVC) {
// TODO: fmin/fmax/fcmpxchg for double requires a newer GPU driver.
if constexpr (!std::is_same_v<Op<double, N>, ImplLSCFmax<double, N>> &&
!std::is_same_v<Op<double, N>, ImplLSCFmin<double, N>> &&
!std::is_same_v<Op<double, N>, ImplLSCFcmpwr<double, N>>) {
if (q.get_device().has(sycl::aspect::atomic64) &&
q.get_device().has(sycl::aspect::fp64)) {
passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
}
if (q.get_device().has(sycl::aspect::atomic64) &&
q.get_device().has(sycl::aspect::fp64)) {
passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
}
}
return passed;
Expand All @@ -633,7 +624,6 @@ bool test_int_types_and_sizes(queue q) {
passed &= test_int_types<2, Op, UseMask, Features, UseAcc, SignMask>(q);
passed &= test_int_types<4, Op, UseMask, Features, UseAcc, SignMask>(q);
passed &= test_int_types<8, Op, UseMask, Features, UseAcc, SignMask>(q);
// TODO: N=16 and N=32 does not pass on Gen12 with mask due to older driver.
if (UseMask && Features == TestFeatures::Generic &&
esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::LinuxAndWindows,
"26918", "101.4953", false)) {
Expand All @@ -645,13 +635,8 @@ bool test_int_types_and_sizes(queue q) {
if constexpr (Features == TestFeatures::DG2 ||
Features == TestFeatures::PVC) {
passed &= test_int_types<64, Op, UseMask, Features, UseAcc, SignMask>(q);
// non power of two values are supported only in newer driver.
// TODO: Enable this when the new driver reaches test infrastructure
// (v27556).
#if 0
passed &= test_int_types<12, Op, UseMask, Features, UseAcc, SignMask>(q);
passed &= test_int_types<33, Op, UseMask, Features, UseAcc, SignMask>(q);
#endif
}

return passed;
Expand All @@ -672,13 +657,8 @@ bool test_fp_types_and_sizes(queue q) {
if constexpr (Features == TestFeatures::DG2 ||
Features == TestFeatures::PVC) {
passed &= test_fp_types<64, Op, UseMask, Features, UseAcc>(q);
// non power of two values are supported only in newer driver.
// TODO: Enable this when the new driver reaches test infrastructure
// (v27556).
#if 0
passed &= test_fp_types<33, Op, UseMask, Features, UseAcc>(q);
passed &= test_fp_types<65, Op, UseMask, Features, UseAcc>(q);
#endif
}
return passed;
}
Expand All @@ -705,29 +685,33 @@ int test_with_mask(queue q) {
test_int_types_and_sizes<ImplUMin, UseMask, Features, UseAcc, Unsigned>(
q);

// Check load/store operations.
passed &= test_int_types_and_sizes<ImplLoad, UseMask, Features, UseAcc>(q);
passed &= test_int_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
// 'float' 'load' and 'store' do not require DG2/PVC.
passed &= test_fp_types_and_sizes<ImplLoad, UseMask, Features, UseAcc>(q);
passed &= test_fp_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);

if constexpr (Features == TestFeatures::DG2 ||
Features == TestFeatures::PVC) {
passed &=
test_fp_types_and_sizes<ImplLSCFmax, UseMask, Features, UseAcc>(q);
passed &=
test_fp_types_and_sizes<ImplLSCFmin, UseMask, Features, UseAcc>(q);

// TODO: fadd/fsub are emulated in the newer driver, but do not pass
// validation.
#if 0
}
// TODO: GPU driver promised to support FADD/FSUB on DG2, but it doesn't.
// Report the issue to driver, enable FADD/FSUB for DG2 when it is fixed.
if constexpr (Features == TestFeatures::PVC) {
passed &= test_fp_types_and_sizes<ImplFadd, UseMask, Features, UseAcc>(q);
passed &= test_fp_types_and_sizes<ImplFsub, UseMask, Features, UseAcc>(q);
#endif

// Check load/store operations.
passed &= test_int_types_and_sizes<ImplLoad, UseMask, Features, UseAcc>(q);
passed &= test_int_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
passed &= test_fp_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
}
#else
passed &= test_int_types_and_sizes<ImplCmpxchg, UseMask, Features, UseAcc>(q);
passed &=
test_fp_types_and_sizes<ImplLSCFcmpwr, UseMask, Features, UseAcc>(q);
if constexpr (Features == TestFeatures::DG2 ||
Features == TestFeatures::PVC) {
passed &=
test_fp_types_and_sizes<ImplLSCFcmpwr, UseMask, Features, UseAcc>(q);
}
#endif
return passed;
}
Expand Down
17 changes: 15 additions & 2 deletions sycl/test/esimd/memory_properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -910,14 +910,14 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
auto res_slm_atomic_0 =
slm_atomic_update<atomic_op::add, int16_t>(offsets, add);
}
// Expect DWORD for fmin.
// Expect LSC for fmin.
{
constexpr int VL = 16;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(float);
auto pred = simd_mask<VL>(1);
simd<float, VL> min = simd<float, VL>(1) * sizeof(int);

// CHECK: call <16 x float> @llvm.genx.dword.atomic.fmin.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i32 {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> undef)
// CHECK: call <16 x float> @llvm.genx.lsc.xatomic.slm.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 21, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> undef, i32 0, <16 x float> undef)
auto res_slm_atomic_0 =
slm_atomic_update<atomic_op::fmin, float>(offsets, min, pred);
}
Expand Down Expand Up @@ -1038,6 +1038,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
auto res_slm_atomic_0 = slm_atomic_update<atomic_op::cmpxchg, int64_t>(
offsets, swap, compare, pred);
}

// Expect LSC for FP types.
{
constexpr int VL = 16;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int64_t);
auto compare = simd<float, VL>(VL, 1);
auto swap = compare * 2;
auto pred = simd_mask<VL>(1);

// CHECK: call <16 x float> @llvm.genx.lsc.xatomic.slm.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}} i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> {{[^)]+}}, i32 0, <16 x float> undef)
auto res_slm_atomic_0 = slm_atomic_update<atomic_op::fcmpxchg, float>(
offsets, swap, compare, pred);
}
}

// Test with local accessor.
Expand Down

0 comments on commit 3756fd1

Please sign in to comment.