From 4da961b2bda81c02bc952f9b813db895c61db40e Mon Sep 17 00:00:00 2001 From: iyamazaki Date: Sun, 20 Oct 2024 22:15:35 -0700 Subject: [PATCH] Tacho : move workspace allocation to symbolic from numeric Signed-off-by: iyamazaki --- packages/amesos2/src/Amesos2_Tacho_decl.hpp | 2 + packages/amesos2/src/Amesos2_Tacho_def.hpp | 15 +- .../tacho/src/impl/Tacho_Driver_Impl.hpp | 2 +- .../src/impl/Tacho_NumericTools_Base.hpp | 3 +- .../src/impl/Tacho_NumericTools_Factory.hpp | 3 +- .../src/impl/Tacho_NumericTools_LevelSet.hpp | 188 ++++++++++-------- 6 files changed, 126 insertions(+), 87 deletions(-) diff --git a/packages/amesos2/src/Amesos2_Tacho_decl.hpp b/packages/amesos2/src/Amesos2_Tacho_decl.hpp index 1dd3f8e06988..95c71b184dc6 100644 --- a/packages/amesos2/src/Amesos2_Tacho_decl.hpp +++ b/packages/amesos2/src/Amesos2_Tacho_decl.hpp @@ -194,6 +194,8 @@ class TachoSolver : public SolverCore int method; int variant; int small_problem_threshold_size; + int streams; + bool verbose; // int num_kokkos_threads; // int max_num_superblocks; } data_; diff --git a/packages/amesos2/src/Amesos2_Tacho_def.hpp b/packages/amesos2/src/Amesos2_Tacho_def.hpp index 4056c53604be..221e505dbc54 100644 --- a/packages/amesos2/src/Amesos2_Tacho_def.hpp +++ b/packages/amesos2/src/Amesos2_Tacho_def.hpp @@ -27,8 +27,10 @@ TachoSolver::TachoSolver( Teuchos::RCP B ) : SolverCore(A, X, B) { - data_.method = 1; // Cholesky - data_.variant = 2; // solver variant + data_.method = 1; // Cholesky + data_.variant = 2; // solver variant + data_.streams = 1; // # of streams + data_.verbose = false; // verbose } @@ -74,7 +76,8 @@ TachoSolver::symbolicFactorization_impl() data_.solver.setSolutionMethod(data_.method); data_.solver.setLevelSetOptionAlgorithmVariant(data_.variant); data_.solver.setSmallProblemThresholdsize(data_.small_problem_threshold_size); - + data_.solver.setVerbose(data_.verbose); + data_.solver.setLevelSetOptionNumStreams(data_.streams); // TODO: Confirm param options // data_.solver.setMaxNumberOfSuperblocks(data_.max_num_superblocks); @@ -216,6 +219,10 @@ TachoSolver::setParameters_impl(const Teuchos::RCPget ("variant", 2); // small problem threshold data_.small_problem_threshold_size = parameterList->get ("small problem threshold size", 1024); + // verbosity + data_.verbose = parameterList->get ("verbose", false); + // # of streams + data_.streams = parameterList->get ("num-streams", 1); // TODO: Confirm param options // data_.num_kokkos_threads = parameterList->get("kokkos-threads", 1); // data_.max_num_superblocks = parameterList->get("max-num-superblocks", 4); @@ -234,6 +241,8 @@ TachoSolver::getValidParameters_impl() const pl->set("method", "chol", "Type of factorization, chol, ldl, or lu"); pl->set("variant", 2, "Type of solver variant, 0, 1, or 2"); pl->set("small problem threshold size", 1024, "Problem size threshold below with Tacho uses LAPACK."); + pl->set("verbose", false, "Verbosity"); + pl->set("num-streams", 1, "Number of GPU streams"); // TODO: Confirm param options // pl->set("kokkos-threads", 1, "Number of threads"); diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp index c6fd7d732527..bf5e720265ee 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_Driver_Impl.hpp @@ -22,7 +22,7 @@ namespace Tacho { template Driver::Driver() - : _method(1), _order_connected_graph_separately(0), _m(0), _nnz(0), _ap(), _h_ap(), _aj(), _h_aj(), _perm(), + : _method(1), _order_connected_graph_separately(1), _m(0), _nnz(0), _ap(), _h_ap(), _aj(), _h_aj(), _perm(), _h_perm(), _peri(), _h_peri(), _m_graph(0), _nnz_graph(0), _h_ap_graph(), _h_aj_graph(), _h_perm_graph(), _h_peri_graph(), _nsupernodes(0), _N(nullptr), _verbose(0), _small_problem_thres(1024), _serial_thres_size(-1), _mb(-1), _nb(-1), _front_update_mode(-1), _levelset(0), _device_level_cut(0), _device_factor_thres(128), diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp index aafda0482428..312c2bfcefd9 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Base.hpp @@ -125,9 +125,10 @@ template class NumericToolsBase { virtual void print_stat_factor() { const double kilo(1024); printf(" Time\n"); + printf(" time for extra tasks (allocation): %10.6f s\n", stat.t_extra); printf(" time for copying A into supernodes: %10.6f s\n", stat.t_copy); printf(" time for numeric factorization: %10.6f s\n", stat.t_factor); - printf(" total time spent: %10.6f s\n", (stat.t_copy + stat.t_factor)); + printf(" total time spent: %10.6f s\n", (stat.t_extra + stat.t_copy + stat.t_factor)); printf("\n"); printf(" Memory\n"); printf(" memory used in factorization: %10.3f MB\n", stat.m_used / kilo / kilo); diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Factory.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Factory.hpp index b29f3a1d27e7..ce31b241aa13 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Factory.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_Factory.hpp @@ -110,8 +110,7 @@ template class NumericToolsFactory; _gid_colidx, _sid_ptr, _sid_colidx, _blk_colidx, _stree_parent, \ _stree_ptr, _stree_children, _stree_level, _stree_roots); \ numeric_tools_levelset_name *N = dynamic_cast(object); \ - N->initialize(_device_level_cut, _device_factor_thres, _device_solve_thres, _verbose); \ - N->createStream(_nstreams, _verbose); \ + N->initialize(_device_level_cut, _device_factor_thres, _device_solve_thres, _nstreams, _verbose); \ } while (false) /// diff --git a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp index 322151e6015a..18897036922a 100644 --- a/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp +++ b/packages/shylu/shylu_node/tacho/src/impl/Tacho_NumericTools_LevelSet.hpp @@ -173,12 +173,15 @@ class NumericToolsLevelSet : public NumericToolsBase { size_type_array _buf_factor_ptr; // workspace meta data for solve + ordinal_type _nrhs; size_type_array_host _h_buf_solve_ptr, _h_buf_solve_nrhs_ptr; size_type_array _buf_solve_ptr, _buf_solve_nrhs_ptr; // workspace size_type _bufsize_factorize, _bufsize_solve; + size_type _worksize; value_type_array _buf; + value_type_array _work; // for using SpMV rowptr_view rowptrU; @@ -405,7 +408,7 @@ class NumericToolsLevelSet : public NumericToolsBase { /// initialization / release /// inline void initialize(const ordinal_type device_level_cut, const ordinal_type device_factorize_thres, - const ordinal_type device_solve_thres, const ordinal_type verbose = 0) { + const ordinal_type device_solve_thres, const int nstreams = 1, const ordinal_type verbose = 0) { stat_level.n_device_factorize = 0; stat_level.n_device_solve = 0; stat_level.n_team_factorize = 0; @@ -533,7 +536,8 @@ class NumericToolsLevelSet : public NumericToolsBase { _h_buf_solve_nrhs_ptr = size_type_array_host(do_not_initialize_tag("h_buf_solve_nrhs_ptr"), _h_buf_solve_ptr.extent(0)); - _buf_solve_nrhs_ptr = Kokkos::create_mirror_view(exec_memory_space(), _h_buf_solve_nrhs_ptr); + Kokkos::deep_copy(_h_buf_solve_nrhs_ptr, _h_buf_solve_ptr); + _buf_solve_nrhs_ptr = Kokkos::create_mirror_view_and_copy(exec_memory_space(), _h_buf_solve_nrhs_ptr); track_alloc(_buf_solve_nrhs_ptr.span() * sizeof(size_type)); /// @@ -559,6 +563,42 @@ class NumericToolsLevelSet : public NumericToolsBase { _handle_lapack = _handle_blas; } #endif + // pre-allocate buf + _nrhs = 1; + Kokkos::resize(_buf, max(_bufsize_factorize, _bufsize_solve)); + track_alloc(_buf.span() * sizeof(value_type)); + // pre-allocate work + _worksize = 0; + switch (this->getSolutionMethod()) { + case 1: { /// Cholesky +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + value_type_matrix T(NULL, _info.max_supernode_size, _info.max_supernode_size); + _worksize = Chol::invoke(_handle_lapack, T, _work); +#endif + break; + } + case 2: { /// LDL +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + value_type_matrix T(NULL, _info.max_supernode_size, _info.max_supernode_size); + ordinal_type_array P(NULL, _info.max_supernode_size); + _worksize = LDL::invoke(_handle_lapack, T, P, _work); +#else + _worksize = 32 * _info.max_supernode_size; +#endif + break; + } + case 3: { /// LU +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + value_type_matrix T(NULL, _info.max_supernode_size, _info.max_num_cols); + ordinal_type_array P(NULL, std::min(_info.max_supernode_size, _info.max_num_cols)); + _worksize = LU::invoke(_handle_lapack, T, P, _work); +#endif + break; + } + } + size_type worksize = _worksize * (nstreams + 1); + Kokkos::resize(_work, worksize); + track_alloc(_work.span() * sizeof(value_type)); stat.t_init = timer.seconds(); /// @@ -621,6 +661,7 @@ class NumericToolsLevelSet : public NumericToolsBase { _solve_mode = Kokkos::create_mirror_view_and_copy(exec_memory_space(), _h_solve_mode); track_alloc(_solve_mode.span() * sizeof(ordinal_type)); + createStream(nstreams, verbose); stat.t_mode_classification = timer.seconds(); if (verbose) { switch (this->getSolutionMethod()) { @@ -653,10 +694,15 @@ class NumericToolsLevelSet : public NumericToolsBase { track_free(_buf_factor_ptr.span() * sizeof(size_type)); track_free(_buf_solve_ptr.span() * sizeof(size_type)); track_free(_buf_solve_nrhs_ptr.span() * sizeof(size_type)); - track_free(_buf.span() * sizeof(value_type)); track_free(_factorize_mode.span() * sizeof(ordinal_type)); track_free(_solve_mode.span() * sizeof(ordinal_type)); track_free(_level_sids.span() * sizeof(ordinal_type)); + + track_free(_buf.span() * sizeof(value_type)); + track_free(_work.span() * sizeof(value_type)); + Kokkos::resize(_buf, 0); + Kokkos::resize(_work, 0); + if (verbose) { printf("Summary: LevelSetTools-Variant-%d (Release)\n", variant); printf("===========================================\n"); @@ -745,13 +791,14 @@ class NumericToolsLevelSet : public NumericToolsBase { inline void createStream(const ordinal_type nstreams, const ordinal_type verbose = 0) { #if defined(KOKKOS_ENABLE_CUDA) + _nstreams = nstreams; + if (_streams.size() == size_t(nstreams)) return; // destroy previously created streams - for (ordinal_type i = 0; i < _nstreams; ++i) { + for (size_t i = 0; i < _streams.size(); ++i) { _status = cudaStreamDestroy(_streams[i]); checkDeviceStatus("cudaStreamDestroy"); } // new streams - _nstreams = nstreams; _streams.clear(); _streams.resize(_nstreams); for (ordinal_type i = 0; i < _nstreams; ++i) { @@ -760,8 +807,10 @@ class NumericToolsLevelSet : public NumericToolsBase { } #endif #if defined(KOKKOS_ENABLE_HIP) + _nstreams = nstreams; + if (_streams.size() == size_t(nstreams)) return; // destroy previously created streams - for (ordinal_type i = 0; i < _nstreams; ++i) { + for (size_t i = 0; i < _streams.size(); ++i) { _status = rocblas_destroy_handle(_handles[i]); checkDeviceLapackStatus("rocblasDestroy"); @@ -769,7 +818,6 @@ class NumericToolsLevelSet : public NumericToolsBase { checkDeviceStatus("hipStreamDestroy"); } // new streams - _nstreams = nstreams; _streams.clear(); _streams.resize(_nstreams); _handles.resize(_nstreams); @@ -2098,19 +2146,22 @@ class NumericToolsLevelSet : public NumericToolsBase { double time_update = 0.0; timer.reset(); - value_type_array work; - { - _buf = value_type_array(do_not_initialize_tag("buf"), _bufsize_factorize); + if (_buf.span() < size_t(_bufsize_factorize)) { + if (_buf.span() > 0) track_free(_buf.span() * sizeof(value_type)); + Kokkos::resize(_buf, _bufsize_factorize); track_alloc(_buf.span() * sizeof(value_type)); + } #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - value_type_matrix T(NULL, _info.max_supernode_size, _info.max_supernode_size); - const size_type worksize = Chol::invoke(_handle_lapack, T, work); - - work = value_type_array(do_not_initialize_tag("work"), worksize * (_nstreams + 1)); - track_alloc(work.span() * sizeof(value_type)); -#endif + { + size_type worksize = _worksize * (_nstreams + 1); + if (size_t(worksize) > _work.span()) { + if (_work.span() > 0) track_free(_work.span() * sizeof(value_type)); + Kokkos::resize(_work, worksize); + track_alloc(_work.span() * sizeof(value_type)); + } } +#endif stat.t_extra = timer.seconds(); timer.reset(); @@ -2197,7 +2248,7 @@ class NumericToolsLevelSet : public NumericToolsBase { if (verbose) { Kokkos::fence(); tick.reset(); } - factorizeCholeskyOnDevice(pbeg, pend, h_buf_factor_ptr, work); + factorizeCholeskyOnDevice(pbeg, pend, h_buf_factor_ptr, _work); if (verbose) { Kokkos::fence(); time_device += tick.seconds(); tick.reset(); @@ -2224,13 +2275,6 @@ class NumericToolsLevelSet : public NumericToolsBase { bool lu = false; extractCRS(lu); } - { -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - track_free(work.span() * sizeof(value_type)); -#endif - track_free(_buf.span() * sizeof(value_type)); - _buf = value_type_array(); - } stat.t_extra += timer.seconds(); if (verbose) { @@ -2368,7 +2412,7 @@ class NumericToolsLevelSet : public NumericToolsBase { // compute t = L^{-1}*w const value_type alpha (1); const value_type beta (0); - if (_w_vec.extent(1) != nrhs) { + if (_w_vec.extent(1) != size_t(nrhs)) { // expand workspace Kokkos::resize(_w_vec, m, nrhs); // attach to Cusparse/Rocsparse data struct @@ -3612,18 +3656,20 @@ class NumericToolsLevelSet : public NumericToolsBase { const size_type buf_extent = _bufsize_solve * nrhs; const size_type buf_span = _buf.span(); - if (buf_extent != buf_span) { - _buf = value_type_array(do_not_initialize_tag("buf"), buf_extent); - track_free(buf_span * sizeof(value_type)); + if (buf_extent > buf_span) { + if (_buf.span() > 0) track_free(buf_span * sizeof(value_type)); + Kokkos::resize(_buf, buf_extent); track_alloc(_buf.span() * sizeof(value_type)); - { - const Kokkos::RangePolicy policy(0, _buf_solve_ptr.extent(0)); - const auto buf_solve_nrhs_ptr = _buf_solve_nrhs_ptr; - const auto buf_solve_ptr = _buf_solve_ptr; - Kokkos::parallel_for( - policy, KOKKOS_LAMBDA(const ordinal_type &i) { buf_solve_nrhs_ptr(i) = nrhs * buf_solve_ptr(i); }); - } + } + if (nrhs > _nrhs) { + // update pointer to solver-workspace with differet nrhs + const Kokkos::RangePolicy policy(0, _buf_solve_ptr.extent(0)); + const auto buf_solve_nrhs_ptr = _buf_solve_nrhs_ptr; + const auto buf_solve_ptr = _buf_solve_ptr; + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(const ordinal_type &i) { buf_solve_nrhs_ptr(i) = nrhs * buf_solve_ptr(i); }); Kokkos::deep_copy(_h_buf_solve_nrhs_ptr, _buf_solve_nrhs_ptr); + _nrhs = nrhs; } } } @@ -3823,23 +3869,21 @@ class NumericToolsLevelSet : public NumericToolsBase { double time_update = 0.0; timer.reset(); - value_type_array work; - { - _buf = value_type_array(do_not_initialize_tag("buf"), _bufsize_factorize); + if (_buf.span() < size_t(_bufsize_factorize)) { + if (_buf.span() > 0) track_free(_buf.span() * sizeof(value_type)); + Kokkos::resize(_buf, _bufsize_factorize); track_alloc(_buf.span() * sizeof(value_type)); - + } #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - value_type_matrix T(NULL, _info.max_supernode_size, _info.max_supernode_size); - ordinal_type_array P(NULL, _info.max_supernode_size); - const size_type worksize = LDL::invoke(_handle_lapack, T, P, work); - - work = value_type_array(do_not_initialize_tag("work"), worksize * (_nstreams + 1) * max(8, _nstreams)); -#else - const size_type worksize = 32 * _info.max_supernode_size; - work = value_type_array(do_not_initialize_tag("work"), worksize); -#endif - track_alloc(work.span() * sizeof(value_type)); + { + size_type worksize = _worksize * (_nstreams + 1); + if (size_t(worksize) > _work.span()) { + if (_work.span() > 0) track_free(_work.span() * sizeof(value_type)); + Kokkos::resize(_work, worksize); + track_alloc(_work.span() * sizeof(value_type)); + } } +#endif stat.t_extra = timer.seconds(); timer.reset(); @@ -3934,7 +3978,7 @@ class NumericToolsLevelSet : public NumericToolsBase { if (verbose) { Kokkos::fence(); tick.reset(); } - factorizeLDL_OnDevice(pbeg, pend, h_buf_factor_ptr, work); + factorizeLDL_OnDevice(pbeg, pend, h_buf_factor_ptr, _work); if (verbose) { Kokkos::fence(); time_device += tick.seconds(); tick.reset(); @@ -3957,15 +4001,6 @@ class NumericToolsLevelSet : public NumericToolsBase { } } // end of LDL stat.t_factor = timer.seconds(); - timer.reset(); - { -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - track_free(work.span() * sizeof(value_type)); -#endif - track_free(_buf.span() * sizeof(value_type)); - _buf = value_type_array(); - } - stat.t_extra += timer.seconds(); if (verbose) { printf("Summary: LevelSetTools-Variant-%d (LDL Factorize)\n", variant); @@ -4178,22 +4213,22 @@ class NumericToolsLevelSet : public NumericToolsBase { double time_update = 0.0; timer.reset(); - value_type_array work; - { - _buf = value_type_array(do_not_initialize_tag("buf"), _bufsize_factorize); + if (_buf.span() < size_t(_bufsize_factorize)) { + if (_buf.span() > 0) track_free(_buf.span() * sizeof(value_type)); + Kokkos::resize(_buf, _bufsize_factorize); track_alloc(_buf.span() * sizeof(value_type)); + } #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - // NOTE : move this to symbolic with the actual max worksize? - value_type_matrix T(NULL, _info.max_supernode_size, _info.max_num_cols); - ordinal_type_array P(NULL, std::min(_info.max_supernode_size, _info.max_num_cols)); - const size_type worksize = LU::invoke(_handle_lapack, T, P, work); - - work = value_type_array(do_not_initialize_tag("work"), worksize * (_nstreams + 1)); - // work = value_type_array(do_not_initialize_tag("work"), worksize*_nstreams); -#endif - track_alloc(work.span() * sizeof(value_type)); + { + size_type worksize = _worksize * (_nstreams + 1); + if (size_t(worksize) > _work.span()) { + if (_work.span() > 0) track_free(_work.span() * sizeof(value_type)); + Kokkos::resize(_work, worksize); + track_alloc(_work.span() * sizeof(value_type)); + } } +#endif stat.t_extra = timer.seconds(); timer.reset(); @@ -4289,7 +4324,7 @@ class NumericToolsLevelSet : public NumericToolsBase { if (verbose) { Kokkos::fence(); tick.reset(); } - factorizeLU_OnDevice(pbeg, pend, h_buf_factor_ptr, work); + factorizeLU_OnDevice(pbeg, pend, h_buf_factor_ptr, _work); if (verbose) { Kokkos::fence(); time_device += tick.seconds(); tick.reset(); @@ -4318,19 +4353,12 @@ class NumericToolsLevelSet : public NumericToolsBase { bool lu = true; extractCRS(lu); } - { -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - track_free(work.span() * sizeof(value_type)); -#endif - track_free(_buf.span() * sizeof(value_type)); - _buf = value_type_array(); - } stat.t_extra += timer.seconds(); if (verbose) { printf("Summary: LevelSetTools-Variant-%d (LU Factorize)\n", variant); printf("================================================\n"); - printf( "\n ** Team = %f s, Device = %f s, Update = %f s **\n\n",time_parallel,time_device,time_update ); + printf( "\n ** Team = %f s, Device = %f s, Update = %f s (%d streams) **\n\n",time_parallel,time_device,time_update,_nstreams ); print_stat_factor(); fflush(stdout); }