From 3756fd1b778ae4ab36bd3988bfdf9ba910b779fd Mon Sep 17 00:00:00 2001 From: Vyacheslav Klochkov Date: Mon, 29 Apr 2024 16:44:14 -0500 Subject: [PATCH] [ESIMD] Enable FADD/FSUB for slm_atomic_update (#13535) Those test cases required newer GPU driver and thus were disabled previously. GPU driver on DG2 still does not handle correctly atomic_update for 'float' and 'half' types. GPU driver on PVC still does not handle correctly slm_atomic_update for 'half' types. --------- Signed-off-by: Vyacheslav N Klochkov --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 20 +++--- .../Inputs/atomic_update.hpp | 43 ++++-------- .../Inputs/atomic_update_slm.hpp | 70 +++++++------------ sycl/test/esimd/memory_properties.cpp | 17 ++++- 4 files changed, 66 insertions(+), 84 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index c0961a1e20f03..a44c8cad98c3c 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -5823,7 +5823,7 @@ slm_atomic_update_impl(simd offsets, simd src0, constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; constexpr int IOp = lsc_to_internal_atomic_op(); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v || std::is_same_v) { return __esimd_lsc_xatomic_slm_1(pred.data(), offsets.data(), @@ -5867,7 +5867,7 @@ __ESIMD_API simd slm_atomic_update_impl(simd offsets, constexpr lsc_vector_size VS = to_lsc_vector_size<1>(); constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; constexpr int IOp = lsc_to_internal_atomic_op(); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v || std::is_same_v) { return __esimd_lsc_xatomic_slm_2(pred.data(), offsets.data(), @@ -6007,11 +6007,11 @@ template __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 1, simd> slm_atomic_update(simd byte_offset, simd src0, simd_mask mask = 1) { - // 2 byte, 8 byte types, non-power of two, and operations wider than - // 32 are supported only by LSC. - if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || - !__ESIMD_DNS::isPowerOf2(N, 32)) { - // half and short are supported in LSC. + // Non-LSC atomic_update supports only 4-byte int vector operations with + // 1,2,4,8,16,32 vector length. Non-LSC supports only 'store' for FP types. + if constexpr (Op == atomic_op::fmin || Op == atomic_op::fmax || + Op == atomic_op::fadd || Op == atomic_op::fsub || + sizeof(T) != 4 || !__ESIMD_DNS::isPowerOf2(N, 32)) { return slm_atomic_update_impl( byte_offset, src0, mask); @@ -6096,9 +6096,9 @@ template __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args() == 2, simd> slm_atomic_update(simd byte_offset, simd src0, simd src1, simd_mask mask = 1) { - // 2 byte, 8 byte types, non-power of two, and operations wider than - // 32 are supported only by LSC. - if constexpr (sizeof(T) == 2 || sizeof(T) == 8 || + // Non-LSC atomic_update supports only 4-byte int vector operations with + // 1,2,4,8,16,32 vector length. + if constexpr (sizeof(T) != 4 || Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) { // 2-argument lsc_atomic_update arguments order matches the standard one - // expected value first, then new value. But atomic_update uses reverse diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp index 985f44c0e6954..af1d1262de959 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp @@ -670,16 +670,13 @@ template class Op, bool UseMask, bool UseLSCFeatures, bool UseAcc> bool test_fp_types(queue q, const Config &cfg) { bool passed = true; - if constexpr (UseLSCFeatures) { - if constexpr (std::is_same_v, ImplFmin> || - std::is_same_v, ImplFmax> || - std::is_same_v, - ImplFcmpwr>) { - auto dev = q.get_device(); - if (dev.has(sycl::aspect::fp16)) { - passed &= run_test( - q, cfg); - } + // TODO: Enable FADD/FSUB on DG2/PVC when the error in GPU driver is resolved. + if constexpr (UseLSCFeatures && + !std::is_same_v, ImplFadd> && + !std::is_same_v, ImplFsub>) { + if (q.get_device().has(sycl::aspect::fp16)) { + passed &= + run_test(q, cfg); } } passed &= run_test(q, cfg); @@ -688,7 +685,6 @@ bool test_fp_types(queue q, const Config &cfg) { q.get_device().has(sycl::aspect::fp64)) { passed &= run_test(q, cfg); } - #endif // CMPXCHG_TEST return passed; } @@ -703,7 +699,6 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) { test_int_types<2, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg); passed &= test_int_types<4, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg); - passed &= test_int_types<8, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg); passed &= @@ -715,13 +710,10 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) { if constexpr (UseLSCFeatures) { passed &= test_int_types<64, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>( q, cfg); - // non power of two values are supported only in newer driver. - // TODO: Enable this when the new driver reaches test infrastructure - // (v27556). -#if 0 - passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg); - passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg); -#endif + passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>( + q, cfg); + passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>( + q, cfg); } return passed; @@ -734,21 +726,14 @@ bool test_fp_types_and_sizes(queue q, const Config &cfg) { passed &= test_fp_types<1, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); passed &= test_fp_types<2, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); passed &= test_fp_types<4, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); - passed &= test_fp_types<8, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); - // Supported by LSC atomic: + passed &= test_fp_types<16, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); + passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); + if constexpr (UseLSCFeatures) { - passed &= test_fp_types<16, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); - passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); passed &= test_fp_types<64, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); - - // non power of two values are supported only in newer driver. - // TODO: Enable this when the new driver reaches test infrastructure - // (v27556). -#if 0 passed &= test_fp_types<12, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); passed &= test_fp_types<35, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); -#endif } return passed; } diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp index 92f000711e137..93075316f65d7 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp @@ -593,18 +593,14 @@ template class Op, bool UseMask, TestFeatures Features, bool UseAcc> bool test_fp_types(queue q) { bool passed = true; - if constexpr (Features == TestFeatures::DG2 || - Features == TestFeatures::PVC) { - if constexpr (std::is_same_v, - ImplLSCFmax> || - std::is_same_v, - ImplLSCFmin> || - std::is_same_v, - ImplLSCFcmpwr>) { - auto dev = q.get_device(); - if (dev.has(sycl::aspect::fp16)) { - passed &= run_test(q); - } + + // TODO: Enable 'half' FADD/FSUB on DG2 when the error in GPU driver is fixed. + if constexpr (Features == TestFeatures::PVC || + (Features == TestFeatures::DG2 && + !std::is_same_v, ImplFadd> && + !std::is_same_v, ImplFsub>)) { + if (q.get_device().has(sycl::aspect::fp16)) { + passed &= run_test(q); } } @@ -612,14 +608,9 @@ bool test_fp_types(queue q) { if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { - // TODO: fmin/fmax/fcmpxchg for double requires a newer GPU driver. - if constexpr (!std::is_same_v, ImplLSCFmax> && - !std::is_same_v, ImplLSCFmin> && - !std::is_same_v, ImplLSCFcmpwr>) { - if (q.get_device().has(sycl::aspect::atomic64) && - q.get_device().has(sycl::aspect::fp64)) { - passed &= run_test(q); - } + if (q.get_device().has(sycl::aspect::atomic64) && + q.get_device().has(sycl::aspect::fp64)) { + passed &= run_test(q); } } return passed; @@ -633,7 +624,6 @@ bool test_int_types_and_sizes(queue q) { passed &= test_int_types<2, Op, UseMask, Features, UseAcc, SignMask>(q); passed &= test_int_types<4, Op, UseMask, Features, UseAcc, SignMask>(q); passed &= test_int_types<8, Op, UseMask, Features, UseAcc, SignMask>(q); - // TODO: N=16 and N=32 does not pass on Gen12 with mask due to older driver. if (UseMask && Features == TestFeatures::Generic && esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::LinuxAndWindows, "26918", "101.4953", false)) { @@ -645,13 +635,8 @@ bool test_int_types_and_sizes(queue q) { if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { passed &= test_int_types<64, Op, UseMask, Features, UseAcc, SignMask>(q); - // non power of two values are supported only in newer driver. - // TODO: Enable this when the new driver reaches test infrastructure - // (v27556). -#if 0 passed &= test_int_types<12, Op, UseMask, Features, UseAcc, SignMask>(q); passed &= test_int_types<33, Op, UseMask, Features, UseAcc, SignMask>(q); -#endif } return passed; @@ -672,13 +657,8 @@ bool test_fp_types_and_sizes(queue q) { if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { passed &= test_fp_types<64, Op, UseMask, Features, UseAcc>(q); - // non power of two values are supported only in newer driver. - // TODO: Enable this when the new driver reaches test infrastructure - // (v27556). -#if 0 passed &= test_fp_types<33, Op, UseMask, Features, UseAcc>(q); passed &= test_fp_types<65, Op, UseMask, Features, UseAcc>(q); -#endif } return passed; } @@ -705,29 +685,33 @@ int test_with_mask(queue q) { test_int_types_and_sizes( q); + // Check load/store operations. + passed &= test_int_types_and_sizes(q); + passed &= test_int_types_and_sizes(q); + // 'float' 'load' and 'store' do not require DG2/PVC. + passed &= test_fp_types_and_sizes(q); + passed &= test_fp_types_and_sizes(q); + if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { passed &= test_fp_types_and_sizes(q); passed &= test_fp_types_and_sizes(q); - - // TODO: fadd/fsub are emulated in the newer driver, but do not pass - // validation. -#if 0 + } + // TODO: GPU driver promised to support FADD/FSUB on DG2, but it doesn't. + // Report the issue to driver, enable FADD/FSUB for DG2 when it is fixed. + if constexpr (Features == TestFeatures::PVC) { passed &= test_fp_types_and_sizes(q); passed &= test_fp_types_and_sizes(q); -#endif - - // Check load/store operations. - passed &= test_int_types_and_sizes(q); - passed &= test_int_types_and_sizes(q); - passed &= test_fp_types_and_sizes(q); } #else passed &= test_int_types_and_sizes(q); - passed &= - test_fp_types_and_sizes(q); + if constexpr (Features == TestFeatures::DG2 || + Features == TestFeatures::PVC) { + passed &= + test_fp_types_and_sizes(q); + } #endif return passed; } diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 0a43e31f62bd8..fc26b81684b7b 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -910,14 +910,14 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto res_slm_atomic_0 = slm_atomic_update(offsets, add); } - // Expect DWORD for fmin. + // Expect LSC for fmin. { constexpr int VL = 16; simd offsets = simd(1) * sizeof(float); auto pred = simd_mask(1); simd min = simd(1) * sizeof(int); - // CHECK: call <16 x float> @llvm.genx.dword.atomic.fmin.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i32 {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> undef) + // CHECK: call <16 x float> @llvm.genx.lsc.xatomic.slm.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 21, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> undef, i32 0, <16 x float> undef) auto res_slm_atomic_0 = slm_atomic_update(offsets, min, pred); } @@ -1038,6 +1038,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf, auto res_slm_atomic_0 = slm_atomic_update( offsets, swap, compare, pred); } + + // Expect LSC for FP types. + { + constexpr int VL = 16; + simd offsets = simd(1) * sizeof(int64_t); + auto compare = simd(VL, 1); + auto swap = compare * 2; + auto pred = simd_mask(1); + + // CHECK: call <16 x float> @llvm.genx.lsc.xatomic.slm.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}} i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> {{[^)]+}}, i32 0, <16 x float> undef) + auto res_slm_atomic_0 = slm_atomic_update( + offsets, swap, compare, pred); + } } // Test with local accessor.