From cfd0d41a6a3b2e7a25fdea41202c7ad12091bde2 Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Mon, 29 Apr 2024 15:32:50 -0700 Subject: [PATCH] [SYCL][Joint Matrix] Test stores A and B for bfloat16 16x16x16, 32x64x16, 1x64x16 (#13572) --- .../Matrix/element_wise_all_ops_impl.hpp | 111 +++++++++++------- 1 file changed, 67 insertions(+), 44 deletions(-) diff --git a/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp b/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp index 3986d05eaea65..db686d23ae05d 100644 --- a/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp +++ b/sycl/test-e2e/Matrix/element_wise_all_ops_impl.hpp @@ -22,12 +22,14 @@ void assert_ops_ref(host_accessor mat, } template -void verify_op_a(const T l, const T r, const float ref, OP op) { - T mat[NUM_ROWS][NUM_COLS]; - big_matrix big_mat((T *)&mat); + size_t SUB_COLS, use Use, layout Layout, size_t VF, class kernel_name, + typename OP> +void verify_op_ab(const T l, const T r, const float ref, OP op) { + T mat[NUM_ROWS / VF][NUM_COLS * VF]; + big_matrix big_mat((T *)&mat); - buffer bufMat(big_mat.get_data(), range<2>(NUM_ROWS, NUM_COLS)); + buffer bufMat(big_mat.get_data(), + range<2>(NUM_ROWS / VF, NUM_COLS * VF)); queue q; size_t sg_size = get_sg_size(q); @@ -47,20 +49,19 @@ void verify_op_a(const T l, const T r, const float ref, OP op) { const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_mat; + joint_matrix sub_mat; joint_matrix_fill(sg, sub_mat, l); joint_matrix_apply(sg, sub_mat, [=](T &x) { x = op(x, r); }); ext::intel::experimental::matrix::joint_matrix_store( sg, sub_mat, accessMat.template get_multi_ptr() + - (sg_startx * SUB_ROWS) * NUM_COLS + - sg_starty / sg_size * SUB_COLS, - NUM_COLS); + (sg_startx * SUB_ROWS / VF) * NUM_COLS * VF + + sg_starty / sg_size * SUB_COLS * VF, + NUM_COLS * VF); }); // parallel for }).wait(); - assert_ops_ref(bufMat.get_host_access(read_only), ref); + assert_ops_ref( + bufMat.get_host_access(read_only), ref); } template class ewops_a {}; -template void test_ewops_a() { - std::cout << "Test A " << SROWS << "x" << SCOLS << "\n"; +template +class ewops_ab {}; +template +void test_ewops_ab() { + if constexpr (Use == use::a) + std::cout << "Test A "; + else + std::cout << "Test B "; + std::cout << SROWS << "x" << SCOLS << "\n"; static constexpr size_t NROWS = SROWS * 2; static constexpr size_t NCOLS = SCOLS * 2; - verify_op_a>( + verify_op_ab>( T(5.0), T(2.0), 7.0, [](auto l, auto r) { return l + r; }); - verify_op_a>( + verify_op_ab>( T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l - r; }); - verify_op_a>( + verify_op_ab>( T(5.0), T(2.0), 10.0, [](auto l, auto r) { return l * r; }); - verify_op_a>( + verify_op_ab>( T(5.0), T(2.0), 2.5, [](auto l, auto r) { return l / r; }); - verify_op_a>( + verify_op_ab>( T(5.0), T(5.0), 5.0, [](auto l, auto r) { return l == r ? l : T(1.0); }); - verify_op_a>( + verify_op_ab>( T(5.0), T(4.0), 4.0, [](auto l, auto r) { return l == r ? l : r; }); - verify_op_a>( + verify_op_ab>( T(5.0), T(5.0), 1.0, [](auto l, auto r) { return l != r ? l : T(1.0); }); - verify_op_a>( + verify_op_ab>( T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l > r ? T(3.0) : T(2.0); }); - verify_op_a>( + verify_op_ab>( T(5.0), T(2.0), 2.0, [](auto l, auto r) { return l < r ? T(3.0) : T(2.0); }); - verify_op_a>( + verify_op_ab>( T(5.0), T(2.0), 3.0, [](auto l, auto r) { return l >= r ? T(3.0) : T(2.0); }); - verify_op_a>( + verify_op_ab>( T(5.0), T(2.0), 2.0, [](auto l, auto r) { return l <= r ? T(3.0) : T(2.0); }); } @@ -194,30 +213,34 @@ int main() { .get_info(); - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].atype == matrix_type::bf16) { - - if (combinations[i].nsize == 0 || - (combinations[i].msize == 0 && combinations[i].nsize == 16)) { - test_ewops_a(); - test_ewops_c(); - } - - if (combinations[i].msize == 16 && combinations[i].nsize == 16) { + for (auto &combination : combinations) { + if (combination.nsize == 0 || + combination.nsize == 16) { // Intel AMX or architecture::intel_gpu_pvc + test_ewops_ab(); + test_ewops_ab(); + test_ewops_ab(); + test_ewops_c(); + test_ewops_c(); + + if (combination.nsize == 16) { // architecture::intel_gpu_pvc + test_ewops_ab(); test_ewops_c(); - } - // This combination is not currently supported for sub group size = 32 in IGC #if (!defined(SG_SZ) || SG_SZ != 32) - if (combinations[i].msize == 32 && combinations[i].nsize == 64) { + test_ewops_ab(); + test_ewops_ab(); + test_ewops_c(); test_ewops_c(); - } #endif - - if (combinations[i].nsize == 8) { - test_ewops_a(); - test_ewops_c(); } + break; + } + + if (combination.nsize == 8) { // architecture::intel_gpu_dg2* + test_ewops_ab(); + test_ewops_ab(); + test_ewops_c(); + break; } }