diff --git a/.github/workflows/PR-gcc-openmpi.yml b/.github/workflows/AT2.yml similarity index 76% rename from .github/workflows/PR-gcc-openmpi.yml rename to .github/workflows/AT2.yml index 99b513d3f145..d4c6c226d80b 100644 --- a/.github/workflows/PR-gcc-openmpi.yml +++ b/.github/workflows/AT2.yml @@ -1,11 +1,13 @@ -name: PR-gcc-openmpi +name: AT2-EXPERIMENTAL on: pull_request: types: - - labeled - opened - synchronize + pull_request_review: + types: + - submitted branches: - master - develop @@ -15,14 +17,9 @@ permissions: contents: read jobs: - trilogpu02-gcc: - permissions: - actions: write # for styfle/cancel-workflow-action to cancel/stop running workflows - contents: read # for actions/checkout to fetch code - runs-on: [self-hosted, trilogpu02] - container: - image: registry-ex.sandia.gov/trilinos-project/trilinos-containers/rhel8/trilinos-pr-env:gcc-8.5.0 - options: --hostname trilinos-container-gcc-8 + gcc10-openmpi416-EXPERIMENTAL: + runs-on: [self-hosted, gcc-10.3.0_openmpi-4.1.6] + if: ${{ github.event.action == 'synchronize' || github.event.action == 'opened' || github.event.review.state == 'APPROVED' }} steps: - name: env env: @@ -62,7 +59,7 @@ jobs: - name: Generate PR cmake fragment working-directory: /home/Trilinos/build run: | - bash -l -c "source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables --cmake-fragment PR-configuration.cmake ${GITHUB_WORKSPACE}" + bash -l -c "source ${GITHUB_WORKSPACE}/packages/framework/GenConfig/gen-config.sh rhel8_gcc-openmpi_debug_shared_no-kokkos-arch_no-asan_complex_no-fpic_mpi_no-pt_no-rdc_no-uvm_deprecated-on_no-package-enables --force --cmake-fragment PR-configuration.cmake ${GITHUB_WORKSPACE}" - name: Generate enable packages cmake fragment working-directory: /home/Trilinos/build run: | @@ -71,14 +68,9 @@ jobs: working-directory: /home/Trilinos/build run: | bash -l -c 
"cmake -C PR-configuration.cmake -C package-enables.cmake ${GITHUB_WORKSPACE}" - - name: CMakeCache.txt - if: always() - working-directory: /home/Trilinos/build - run: | - bash -l -c "cat CMakeCache.txt" - name: build trilinos working-directory: /home/Trilinos/build - run: bash -l -c "ninja" + run: bash -l -c "ninja -j36" - name: ctest working-directory: /home/Trilinos/build run: bash -l -c "ctest -j36" diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 75cf7d3e8d5a..97323a31371c 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -24,4 +24,4 @@ jobs: - name: 'Checkout Repository' uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 - name: 'Dependency Review' - uses: actions/dependency-review-action@733dd5d4a5203f238c33806593ec0f5fc5343d8c # v4.2.4 + uses: actions/dependency-review-action@5bbc3ba658137598168acb2ab73b21c432dd411b # v4.2.5 diff --git a/.github/workflows/spack.yml b/.github/workflows/spack.yml new file mode 100644 index 000000000000..bb9ef762863c --- /dev/null +++ b/.github/workflows/spack.yml @@ -0,0 +1,30 @@ +name: spack +on: + pull_request: + types: + - opened + - synchronize + pull_request_review: + types: + - submitted + workflow_dispatch: + +jobs: + gcc10-openmpi416: + runs-on: [self-hosted, gcc-10.3.0_openmpi-4.1.6] + steps: + - name: Cancel Previous Runs + uses: styfle/cancel-workflow-action@0.11.0 + with: + access_token: ${{ github.token }} + - name: Clone Trilinos + uses: actions/checkout@v4 + with: + fetch-depth: 1 + - name: Spack build + shell: bash -l {0} + run: | + spack develop --no-clone --path $GITHUB_WORKSPACE trilinos@develop + spack add trilinos@develop + spack concretize -f + spack install --cdash-upload-url=https://sems-cdash-son.sandia.gov/cdash/submit.php?project=Trilinos -j16 diff --git a/.github/workflows/title_to_mention.yml b/.github/workflows/title_to_mention.yml index 3c802e6b5831..664e99c194b6 100644 
--- a/.github/workflows/title_to_mention.yml +++ b/.github/workflows/title_to_mention.yml @@ -31,3 +31,10 @@ jobs: issue-number: ${{ github.event.issue.number }} body: | Automatic mention of the @trilinos/ifpack2 team + - name: Mention ROL + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4.0.0 + if: (contains(github.event.action, 'labeled') && contains(github.event.label.name, 'ROL')) || (contains(github.event.action, 'opened') && contains(github.event.issue.title, 'ROL')) + with: + issue-number: ${{ github.event.issue.number }} + body: | + Automatic mention of @aj463 diff --git a/commonTools/framework/ProjectCiFileChangeLogic.py b/commonTools/framework/ProjectCiFileChangeLogic.py index 04359219583a..2d38575c8f1f 100644 --- a/commonTools/framework/ProjectCiFileChangeLogic.py +++ b/commonTools/framework/ProjectCiFileChangeLogic.py @@ -85,6 +85,12 @@ def isGlobalBuildFileRequiringGlobalRebuild(self, modifiedFileFullPath): # Changes under packages/framework/ likely impact the GenConfig PR build # configurations and therefore to be safe, everything needs to be tested. return True + elif lenPathArray >= 2 and modifiedFileFullPathArray[0] == '.github' and \ + modifiedFileFullPathArray[1] == 'workflows' \ + : + # Changes under .github/workflows/ impact CI-type runs on GitHub Actions + # and therefore to be safe, everything needs to be tested. 
+ return True # Any other files not already covered above should *not* trigger a global # build return False diff --git a/commonTools/framework/ProjectCiFileChangeLogic_UnitTests.py b/commonTools/framework/ProjectCiFileChangeLogic_UnitTests.py index a3c0a15e547d..2a81ea5dc031 100644 --- a/commonTools/framework/ProjectCiFileChangeLogic_UnitTests.py +++ b/commonTools/framework/ProjectCiFileChangeLogic_UnitTests.py @@ -116,6 +116,9 @@ def test_packages_something(self): def test_packages_framework(self): self.check('packages/framework/something', True) + def test_dotgithub_workflows(self): + self.check('.github/workflows/something', True) + if __name__ == '__main__': unittest.main() diff --git a/packages/framework/ini-files/config-specs.ini b/packages/framework/ini-files/config-specs.ini index aaefa957b526..12b755ad62a7 100644 --- a/packages/framework/ini-files/config-specs.ini +++ b/packages/framework/ini-files/config-specs.ini @@ -2826,6 +2826,7 @@ opt-set-cmake-var ROL_example_PDE-OPT_navier-stokes_example_01_MPI_4_DISABLE opt-set-cmake-var Pliris_vector_random_MPI_3_DISABLE BOOL : ON opt-set-cmake-var Pliris_vector_random_MPI_4_DISABLE BOOL : ON +opt-set-cmake-var Zoltan_ch_7944_parmetis_parallel_DISABLE BOOL : ON opt-set-cmake-var Zoltan_ch_simple_parmetis_parallel_DISABLE BOOL : ON opt-set-cmake-var Belos_bl_gmres_complex_hb_3_MPI_4_DISABLE BOOL : ON opt-set-cmake-var Belos_hybrid_gmres_complex_hb_0_MPI_4_DISABLE BOOL : ON diff --git a/packages/ifpack2/example/BlockTriDi.cpp b/packages/ifpack2/example/BlockTriDi.cpp index 3277179656c1..0a27ed1bb198 100644 --- a/packages/ifpack2/example/BlockTriDi.cpp +++ b/packages/ifpack2/example/BlockTriDi.cpp @@ -18,7 +18,7 @@ namespace { // (anonymous) // Values of command-line arguments. 
struct CmdLineArgs { - CmdLineArgs ():blockSize(-1),numIters(10),numRepeats(1),tol(1e-12),nx(172),ny(-1),nz(-1),mx(1),my(1),mz(1),sublinesPerLine(1),sublinesPerLineSchur(1),useStackedTimer(false),overlapCommAndComp(false){} + CmdLineArgs ():blockSize(-1),numIters(10),numRepeats(1),tol(1e-12),nx(172),ny(-1),nz(-1),mx(1),my(1),mz(1),sublinesPerLine(1),sublinesPerLineSchur(1),useStackedTimer(false),usePointMatrix(false),overlapCommAndComp(false){} std::string mapFilename; std::string matrixFilename; @@ -37,6 +37,7 @@ struct CmdLineArgs { int sublinesPerLine; int sublinesPerLineSchur; bool useStackedTimer; + bool usePointMatrix; bool overlapCommAndComp; std::string problemName; std::string matrixType; @@ -68,6 +69,8 @@ getCmdLineArgs (CmdLineArgs& args, int argc, char* argv[]) cmdp.setOption ("sublinesPerLine", &args.sublinesPerLine, "If using inline meshing, number of sublines per mesh x line. If set to -1 the block Jacobi algorithm is used."); cmdp.setOption ("withStackedTimer", "withoutStackedTimer", &args.useStackedTimer, "Whether to run with a StackedTimer and print the timer tree at the end (and try to output Watchr report)"); + cmdp.setOption ("withPointMatrix", "withoutPointMatrix", &args.usePointMatrix, + "Whether to run with a point matrix"); cmdp.setOption ("withOverlapCommAndComp", "withoutOverlapCommAndComp", &args.overlapCommAndComp, "Whether to run with overlapCommAndComp)"); cmdp.setOption("problemName", &args.problemName, "Human-readable problem name for Watchr plot"); @@ -279,7 +282,6 @@ main (int argc, char* argv[]) using std::cerr; using std::endl; typedef Tpetra::CrsMatrix<> crs_matrix_type; - typedef Tpetra::BlockCrsMatrix<> block_crs_matrix_type; typedef Tpetra::Map<> map_type; typedef Tpetra::MultiVector<> MV; typedef Tpetra::RowMatrix<> row_matrix_type; @@ -358,11 +360,15 @@ main (int argc, char* argv[]) // Read sparse matrix A from Matrix Market file. 
RCP A; - RCP Ablock; + RCP Ablock; RCP B,X; RCP line_info; #if defined(HAVE_IFPACK2_XPETRA) if(args.matrixFilename == "") { + if (args.usePointMatrix) { + std::string msg = "usePointMatrix with inline matrix is not yet implemented"; + throw std::runtime_error(msg); + } // matrix Teuchos::ParameterList plist; if(args.matrixType == "") { @@ -575,7 +581,10 @@ main (int argc, char* argv[]) { Teuchos::TimeMonitor precSetupTimeMon (*precSetupTime); - precond = rcp(new BTDC(Ablock,parts,args.sublinesPerLineSchur,args.overlapCommAndComp)); + if(args.usePointMatrix) + precond = rcp(new BTDC(A,parts,args.sublinesPerLineSchur,args.overlapCommAndComp, false, args.blockSize)); + else + precond = rcp(new BTDC(Ablock,parts,args.sublinesPerLineSchur,args.overlapCommAndComp)); if(rank0) std::cout<<"Initializing preconditioner..."<initialize (); diff --git a/packages/ifpack2/src/Ifpack2_BlockRelaxation_decl.hpp b/packages/ifpack2/src/Ifpack2_BlockRelaxation_decl.hpp index a8e291807474..5246b3b286cf 100644 --- a/packages/ifpack2/src/Ifpack2_BlockRelaxation_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockRelaxation_decl.hpp @@ -139,6 +139,8 @@ class BlockRelaxation : typedef Tpetra::Vector vector_type; typedef Teuchos::ScalarTraits STS; typedef Teuchos::ScalarTraits STM; + typedef Tpetra::CrsMatrix crs_matrix_type; typedef Tpetra::BlockCrsMatrix block_crs_matrix_type; typedef Tpetra::Map map_type; diff --git a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp index 5d780489a933..ebf713ef64a4 100644 --- a/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockRelaxation_def.hpp @@ -53,6 +53,7 @@ #include "Ifpack2_LocalFilter.hpp" #include "Ifpack2_Parameters.hpp" #include "Teuchos_TimeMonitor.hpp" +#include "Tpetra_BlockCrsMatrix_Helpers_decl.hpp" namespace Ifpack2 { @@ -176,6 +177,8 @@ getValidParameters () const validParams->set("partitioner: coordinates",dummy); validParams->set("timer 
for apply", true); validParams->set("partitioner: subparts per part", 1); + validParams->set("partitioner: block size", -1); + validParams->set("partitioner: print level", false); return validParams; } @@ -629,11 +632,16 @@ initialize () Teuchos::RCP A_bcrs = Teuchos::rcp_dynamic_cast (A_); hasBlockCrsMatrix_ = !A_bcrs.is_null(); - if (A_bcrs.is_null ()) { - hasBlockCrsMatrix_ = false; - } - else { + + if(!hasBlockCrsMatrix_ && List_.isParameter("relaxation: container") && List_.get("relaxation: container") == "BlockTriDi" ) { + TEUCHOS_FUNC_TIME_MONITOR("Ifpack2::BlockRelaxation::initialize::convertToBlockCrsMatrix"); + int block_size = List_.get("partitioner: block size"); + TEUCHOS_TEST_FOR_EXCEPT_MSG + (block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs."); + A_bcrs = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast(A_), block_size); + A_ = A_bcrs; hasBlockCrsMatrix_ = true; + Kokkos::DefaultExecutionSpace().fence(); } NumLocalRows_ = A_->getLocalNumRows (); diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp index 4f066f9b76fc..e72565179cc4 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp @@ -174,6 +174,8 @@ namespace Ifpack2 { //typedef Tpetra::MultiVector local_mv_type; //typedef typename Kokkos::View HostViewLocal; + typedef Tpetra::CrsMatrix + crs_matrix_type; typedef Tpetra::BlockCrsMatrix block_crs_matrix_type; @@ -231,7 +233,9 @@ namespace Ifpack2 { BlockTriDiContainer (const Teuchos::RCP& matrix, const Teuchos::Array >& partitions, const int n_subparts_per_part = 1, - bool overlapCommAndComp = false, bool useSequentialMethod = false); + bool overlapCommAndComp = false, + bool useSequentialMethod = false, + const int block_size = -1); //! Destructor (declared virtual for memory safety of derived classes). 
~BlockTriDiContainer () override; @@ -398,7 +402,8 @@ namespace Ifpack2 { void initInternal (const Teuchos::RCP& matrix, const Teuchos::RCP &importer, const bool overlapCommAndComp, - const bool useSeqMethod); + const bool useSeqMethod, + const int block_size = -1); void clearInternal(); }; diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp index 8e5709e0e937..6fbccc4d92ee 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp @@ -47,6 +47,7 @@ #include #include +#include #include #include @@ -81,7 +82,8 @@ namespace Ifpack2 { ::initInternal (const Teuchos::RCP& matrix, const Teuchos::RCP& importer, const bool overlapCommAndComp, - const bool useSeqMethod) + const bool useSeqMethod, + const int block_size) { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initInternal"); @@ -98,8 +100,15 @@ namespace Ifpack2 { { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA"); impl_->A = Teuchos::rcp_dynamic_cast(matrix); - TEUCHOS_TEST_FOR_EXCEPT_MSG - (impl_->A.is_null(), "BlockTriDiContainer currently supports Tpetra::BlockCrsMatrix only."); + if (impl_->A.is_null()) { + TEUCHOS_TEST_FOR_EXCEPT_MSG + (block_size == -1, "A pointwise matrix and block_size = -1 were given as inputs."); + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA::convertToBlockCrsMatrix"); + impl_->A = Tpetra::convertToBlockCrsMatrix(*Teuchos::rcp_dynamic_cast(matrix), block_size); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + } + } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -197,11 +206,12 @@ namespace Ifpack2 { const Teuchos::Array >& partitions, const int n_subparts_per_part, const bool overlapCommAndComp, - const bool useSeqMethod) + const bool useSeqMethod, + const int block_size) : Container(matrix, partitions, false), 
partitions_(partitions) { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer"); - initInternal(matrix, Teuchos::null, overlapCommAndComp, useSeqMethod); + initInternal(matrix, Teuchos::null, overlapCommAndComp, useSeqMethod, block_size); n_subparts_per_part_ = n_subparts_per_part; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index 4804ce3099b7..8d47c85d1a50 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -300,6 +300,7 @@ namespace Ifpack2 { #else using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; #endif + using impl_scalar_type_1d_view_host = Kokkos::View; using impl_scalar_type_2d_view = typename impl_type::impl_scalar_type_2d_view; using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; @@ -322,6 +323,7 @@ namespace Ifpack2 { SendRecvPair offset_host; // offsets to local id list and data buffer SendRecvPair lids; // local id list SendRecvPair buffer; // data buffer + SendRecvPair buffer_host; // data buffer local_ordinal_type_1d_view dm2cm; // permutation @@ -478,6 +480,11 @@ namespace Ifpack2 { buffer.send = impl_scalar_type_1d_view(do_not_initialize_tag("buffer send"), send_buffer_size); buffer.recv = impl_scalar_type_1d_view(do_not_initialize_tag("buffer recv"), recv_buffer_size); + + if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) { + buffer_host.send = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer send"), send_buffer_size); + buffer_host.recv = impl_scalar_type_1d_view_host(do_not_initialize_tag("buffer recv"), recv_buffer_size); + } } } @@ -558,15 +565,12 @@ namespace Ifpack2 { &reqs.recv[i]); } else { - const auto buffer_recv_host = Kokkos::create_mirror_view( - 
Kokkos::view_alloc(Kokkos::WithoutInitializing), buffer.recv); irecv(comm, - reinterpret_cast(buffer_recv_host.data() + offset_host.recv[i]*mv_blocksize), + reinterpret_cast(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize), (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type), pids.recv[i], 42, &reqs.recv[i]); - Kokkos::deep_copy(buffer.recv, buffer_recv_host); } } @@ -582,7 +586,21 @@ namespace Ifpack2 { mv, blocksize, //execution_space()); exec_instances[i%8]); - + if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) { + //if (i<8) exec_instances[i%8].fence(); + const local_ordinal_type num_vectors = mv.extent(1); + const local_ordinal_type mv_blocksize = blocksize*num_vectors; + + Kokkos::deep_copy(exec_instances[i%8], + Kokkos::subview(buffer_host.send, + Kokkos::pair( + offset_host.send(i)*mv_blocksize, + offset_host.send(i+1)*mv_blocksize)), + Kokkos::subview(buffer.send, + Kokkos::pair( + offset_host.send(i)*mv_blocksize, + offset_host.send(i+1)*mv_blocksize))); + } } /// somehow one unit test fails when we use exec_instance[i%8] //execution_space().fence(); @@ -598,11 +616,8 @@ namespace Ifpack2 { &reqs.send[i]); } else { - const auto buffer_send_host = Kokkos::create_mirror_view( - Kokkos::view_alloc(Kokkos::WithoutInitializing), buffer.send); - Kokkos::deep_copy(buffer_send_host, buffer.send); isend(comm, - reinterpret_cast(buffer_send_host.data() + offset_host.send[i]*mv_blocksize), + reinterpret_cast(buffer_host.send.data() + offset_host.send[i]*mv_blocksize), (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type), pids.send[i], 42, @@ -630,6 +645,21 @@ namespace Ifpack2 { // 0.0. 
wait any waitany(pids.recv.extent(0), reqs.recv.data(), &idx); + if (!Tpetra::Details::Behavior::assumeMpiIsGPUAware()) { + const local_ordinal_type num_vectors = remote_multivector.extent(1); + const local_ordinal_type mv_blocksize = blocksize*num_vectors; + + Kokkos::deep_copy( + Kokkos::subview(buffer.recv, + Kokkos::pair( + offset_host.recv(idx)*mv_blocksize, + offset_host.recv(idx+1)*mv_blocksize)), + Kokkos::subview(buffer_host.recv, + Kokkos::pair( + offset_host.recv(idx)*mv_blocksize, + offset_host.recv(idx+1)*mv_blocksize))); + } + // 0.1. unpack data after data is moved into a device copy(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1), @@ -731,15 +761,12 @@ namespace Ifpack2 { &reqs.recv[i]); } else { - const auto buffer_recv_host = Kokkos::create_mirror_view( - Kokkos::view_alloc(Kokkos::WithoutInitializing), buffer.recv); irecv(comm, - reinterpret_cast(buffer_recv_host.data() + offset_host.recv[i]*mv_blocksize), + reinterpret_cast(buffer_host.recv.data() + offset_host.recv[i]*mv_blocksize), (offset_host.recv[i+1] - offset_host.recv[i])*mv_blocksize*sizeof(impl_scalar_type), pids.recv[i], 42, &reqs.recv[i]); - Kokkos::deep_copy(buffer.recv, buffer_recv_host); } } @@ -757,11 +784,17 @@ namespace Ifpack2 { &reqs.send[i]); } else { - const auto buffer_send_host = Kokkos::create_mirror_view( - Kokkos::view_alloc(Kokkos::WithoutInitializing), buffer.send); - Kokkos::deep_copy(buffer_send_host, buffer.send); + Kokkos::deep_copy( + Kokkos::subview(buffer_host.send, + Kokkos::pair( + offset_host.send(i)*mv_blocksize, + offset_host.send(i+1)*mv_blocksize)), + Kokkos::subview(buffer.send, + Kokkos::pair( + offset_host.send(i)*mv_blocksize, + offset_host.send(i+1)*mv_blocksize))); isend(comm, - reinterpret_cast(buffer_send_host.data() + offset_host.send[i]*mv_blocksize), + reinterpret_cast(buffer_host.send.data() + offset_host.send[i]*mv_blocksize), (offset_host.send[i+1] - offset_host.send[i])*mv_blocksize*sizeof(impl_scalar_type), 
pids.send[i], 42, @@ -787,6 +820,19 @@ namespace Ifpack2 { for (local_ordinal_type i=0,iend=pids.recv.extent(0);i( + offset_host.recv(idx)*mv_blocksize, + offset_host.recv(idx+1)*mv_blocksize)), + Kokkos::subview(buffer_host.recv, + Kokkos::pair( + offset_host.recv(idx)*mv_blocksize, + offset_host.recv(idx+1)*mv_blocksize))); + } copy(lids.recv, buffer.recv, offset_host.recv(idx), offset_host.recv(idx+1), remote_multivector, blocksize); } diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp index 5dcdccfffb33..459853ec12fe 100644 --- a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp @@ -346,6 +346,7 @@ template void Relaxation::setParametersImpl (Teuchos::ParameterList& pl) { using Teuchos::getIntegralValue; + using Teuchos::getStringValue; using Teuchos::ParameterList; using Teuchos::RCP; typedef scalar_type ST; // just to make code below shorter @@ -362,6 +363,10 @@ void Relaxation::setParametersImpl (Teuchos::ParameterList& pl) const Details::RelaxationType precType = getIntegralValue (pl, "relaxation: type"); + const std::string precTypeStr = getStringValue(pl, "relaxation: type"); + // We only access "relaxation: type" using strings in the rest of the code + pl.set("relaxation: type", precTypeStr); + pl.get("relaxation: type"); // We need to mark the parameter as "used" const int numSweeps = pl.get ("relaxation: sweeps"); const ST dampingFactor = pl.get ("relaxation: damping factor"); const bool zeroStartSol = pl.get ("relaxation: zero starting solution"); diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp index 9c606798e890..1e26b4996649 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainer.cpp @@ -279,31 +279,34 @@ static LO 
run_teuchos_tests (const Input& in, Teuchos::FancyOStream& out, bool& for (const bool overlap_comm : {false, true}) { // temporary disabling overlap comm version if (seq_method && overlap_comm) continue; for (const bool nonuniform_lines : {false, true}) { - if (jacobi && nonuniform_lines) continue; - for (const int nvec : {1, 3}) { - std::stringstream ss; - ss << "test_BR_BTDC:" - << " bs " << bs - << (contiguous ? " contig" : " noncontig") - << (jacobi ? " jacobi" : " tridiag") - << (seq_method ? " seq_method" : "") - << (overlap_comm ? " overlap_comm" : "") - << (nonuniform_lines ? " nonuniform_lines" : " uniform_lines") - << " nvec " << nvec; - const std::string details = ss.str(); - bool threw = false; - try { - ne = btdct::test_BR_BTDC(in.comm, sb, sbp, bs, nvec, nonuniform_lines, - different_maps, jacobi, overlap_comm, seq_method, - details); - nerr += ne; - } catch (const std::exception& e) { - threw = true; - } - if (threw) - printf("Exception threw from rank %d, %s\n", in.comm->getRank(), details.c_str()); + for (const bool pointwise : {false, true}) { + if (jacobi && nonuniform_lines) continue; + for (const int nvec : {1, 3}) { + std::stringstream ss; + ss << "test_BR_BTDC:" + << " bs " << bs + << (contiguous ? " contig" : " noncontig") + << (jacobi ? " jacobi" : " tridiag") + << (seq_method ? " seq_method" : "") + << (overlap_comm ? " overlap_comm" : "") + << (pointwise ? " point_wise" : "") + << (nonuniform_lines ? " nonuniform_lines" : " uniform_lines") + << " nvec " << nvec; + const std::string details = ss.str(); + bool threw = false; + try { + ne = btdct::test_BR_BTDC(in.comm, sb, sbp, bs, nvec, nonuniform_lines, + different_maps, jacobi, overlap_comm, seq_method, pointwise, + details); + nerr += ne; + } catch (const std::exception& e) { + threw = true; + } + if (threw) + printf("Exception threw from rank %d, %s\n", in.comm->getRank(), details.c_str()); - TEUCHOS_TEST(ne == 0 && ! threw, details); + TEUCHOS_TEST(ne == 0 && ! 
threw, details); + } } } } diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp index c2216b454b51..7b9c8a28bb39 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp @@ -52,6 +52,7 @@ #include #ifdef HAVE_IFPACK2_EXPERIMENTAL_KOKKOSKERNELS_FEATURES #include +#include "Tpetra_BlockCrsMatrix_Helpers_decl.hpp" namespace tif_utest { @@ -149,6 +150,39 @@ struct BlockTriDiContainerTester { } } + // Make BlockRelaxation smoother with BlockTriDiContainer + // with a pointwise matrix. + // N.B. Modifies A if nonuniform_lines is true. + static Teuchos::RCP > + make_BR_BTDC_PW (const StructuredBlock& sb, const StructuredBlockPart& sbp, + const Teuchos::RCP& A, + const bool nonuniform_lines = false, + const bool zero_starting_soln = true, + const int num_sweeps = 1, + const bool jacobi = false) { + Teuchos::Array > parts; + // make_parts modifies entries of A so the call to convertToCrsMatrix + // needs to happen after make_parts + make_parts(sb, sbp, *A, nonuniform_lines, jacobi, parts); + auto A_pw = Tpetra::convertToCrsMatrix(*A); + const auto T = Teuchos::rcp(new Ifpack2::BlockRelaxation(A_pw)); + { + Teuchos::ParameterList p; + p.set("relaxation: container", "BlockTriDi"); + p.set("relaxation: type", "MT Split Jacobi"); + p.set("relaxation: sweeps", 1); + p.set("partitioner: type", "user"); + p.set("relaxation: zero starting solution", zero_starting_soln); + p.set("relaxation: sweeps", num_sweeps); + p.set("partitioner: local parts", parts.size()); + p.set("partitioner: parts", parts); + p.set("partitioner: subparts per part", 1); + p.set("partitioner: block size", A->getBlockSize()); + T->setParameters(p); + } + return T; + } + // Make BlockRelaxation smoother with BlockTriDiContainer. // N.B. Modifies A if nonuniform_lines is true. 
static Teuchos::RCP > @@ -172,11 +206,30 @@ struct BlockTriDiContainerTester { p.set("partitioner: local parts", parts.size()); p.set("partitioner: parts", parts); p.set("partitioner: subparts per part", 1); + p.set("partitioner: block size", -1); T->setParameters(p); } return T; } + // Make a bare BlockTriDiContainer + // with a pointwise matrix. + // N.B. Modifies A if nonuniform_lines is true. + static Teuchos::RCP > + make_BTDC_PW (const StructuredBlock& sb, const StructuredBlockPart& sbp, + const Teuchos::RCP& A, + const bool overlap_comm = false, const bool nonuniform_lines = false, + const bool jacobi = false, const bool seq_method = false) { + Teuchos::Array > parts; + // make_parts modifies entries of A so the call to convertToCrsMatrix + // needs to happen after make_parts + make_parts(sb, sbp, *A, nonuniform_lines, jacobi, parts); + auto A_pw = Tpetra::convertToCrsMatrix(*A); + + return Teuchos::rcp(new Ifpack2::BlockTriDiContainer( + A_pw, parts, 1, overlap_comm, seq_method, A->getBlockSize())); + } + // Make a bare BlockTriDiContainer. // N.B. Modifies A if nonuniform_lines is true. static Teuchos::RCP > @@ -195,7 +248,7 @@ struct BlockTriDiContainerTester { const StructuredBlock& sb, const StructuredBlockPart& sbp, const Int bs, const Int nvec, const bool nonuniform_lines, const bool different_maps, const bool jacobi, const bool overlap_comm, - const bool seq_method, const std::string& details) { + const bool seq_method, const bool pointwise, const std::string& details) { #define TEST_BR_BTDC_FAIL(msg) do { \ ++nerr; \ if (comm->getRank() == 0) { \ @@ -232,10 +285,16 @@ struct BlockTriDiContainerTester { const bool use_br = ! (overlap_comm || seq_method); const Magnitude tol = 1e-3; const auto T_br = use_br ? - make_BR_BTDC(sb, sbp, A, nonuniform_lines, zero_starting, num_sweeps, jacobi) : + ( pointwise ? 
+ make_BR_BTDC_PW(sb, sbp, A, nonuniform_lines, zero_starting, num_sweeps, jacobi): + make_BR_BTDC(sb, sbp, A, nonuniform_lines, zero_starting, num_sweeps, jacobi) + ): Teuchos::null; const auto T_bare = use_br ? Teuchos::null : - make_BTDC(sb, sbp, A, overlap_comm, nonuniform_lines, jacobi, seq_method); + ( pointwise ? + make_BTDC_PW(sb, sbp, A, overlap_comm, nonuniform_lines, jacobi, seq_method): + make_BTDC(sb, sbp, A, overlap_comm, nonuniform_lines, jacobi, seq_method) + ); if ( ! T_br.is_null()) { T_br->initialize(); T_br->compute(); diff --git a/packages/kokkos-kernels/.readthedocs.yaml b/packages/kokkos-kernels/.readthedocs.yaml new file mode 100644 index 000000000000..519282a17943 --- /dev/null +++ b/packages/kokkos-kernels/.readthedocs.yaml @@ -0,0 +1,35 @@ +# Read the Docs configuration file for Sphinx projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "20" + # rust: "1.70" + # golang: "1.20" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs + # builder: "dirhtml" + # Fail on all warnings to avoid broken references + # fail_on_warning: true + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/packages/kokkos-kernels/BUILD.md b/packages/kokkos-kernels/BUILD.md index 5be269bd7cf5..6fcea4dd3372 100644 --- a/packages/kokkos-kernels/BUILD.md 
+++ b/packages/kokkos-kernels/BUILD.md @@ -227,7 +227,7 @@ endif() * KokkosKernels_LAPACK_ROOT: PATH * Location of LAPACK install root. * Default: None or the value of the environment variable LAPACK_ROOT if set -* KokkosKernels_LINALG_OPT_LEVEL: BOOL +* KokkosKernels_LINALG_OPT_LEVEL: BOOL **DEPRECATED** * Optimization level for KokkosKernels computational kernels: a nonnegative integer. Higher levels result in better performance that is more uniform for corner cases, but increase build time and library size. The default value is 1, which should give performance within ten percent of optimal on most platforms, for most problems. * Default: 1 * KokkosKernels_MAGMA_ROOT: PATH diff --git a/packages/kokkos-kernels/CHANGELOG.md b/packages/kokkos-kernels/CHANGELOG.md index 3ebb102517e6..6bc9cb65a632 100644 --- a/packages/kokkos-kernels/CHANGELOG.md +++ b/packages/kokkos-kernels/CHANGELOG.md @@ -1,5 +1,99 @@ # Change Log +## [4.3.00](https://github.com/kokkos/kokkos-kernels/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.2.01...4.3.00) + +### New Features + +#### BLAS updates +- Syr2 [\#1942](https://github.com/kokkos/kokkos-kernels/pull/1942) + +#### LAPACK updates +- Adding cuSOLVER [\#2038](https://github.com/kokkos/kokkos-kernels/pull/2038) + - Fix for MAGMA with CUDA [\#2044](https://github.com/kokkos/kokkos-kernels/pull/2044) +- Adding rocSOLVER [\#2034](https://github.com/kokkos/kokkos-kernels/pull/2034) + - Fix rocSOLVER issue with Trilinos dependency [\#2037](https://github.com/kokkos/kokkos-kernels/pull/2037) +- Lapack - SVD [\#2092](https://github.com/kokkos/kokkos-kernels/pull/2092) + - Adding benchmark for SVD [\#2103](https://github.com/kokkos/kokkos-kernels/pull/2103) + - Quick return to fix cuSOLVER and improve performance [\#2107](https://github.com/kokkos/kokkos-kernels/pull/2107) + - Fix Intel MKL tolerance for SVD tests [\#2110](https://github.com/kokkos/kokkos-kernels/pull/2110) + +#### Sparse updates +- 
Add block support to all SPILUK algorithms [\#2064](https://github.com/kokkos/kokkos-kernels/pull/2064) + - Block spiluk follow up [\#2085](https://github.com/kokkos/kokkos-kernels/pull/2085) + - Make spiluk_handle::reset backwards compatible [\#2087](https://github.com/kokkos/kokkos-kernels/pull/2087) +- Sptrsv improvements + - Add sptrsv execution space overloads [\#1982](https://github.com/kokkos/kokkos-kernels/pull/1982) + - Refactor Test_Sparse_sptrsv [\#2102](https://github.com/kokkos/kokkos-kernels/pull/2102) + - Add support for BSR matrices to some trsv routines [\#2104](https://github.com/kokkos/kokkos-kernels/pull/2104) +- GMRES: Add support for BSR matrices [\#2097](https://github.com/kokkos/kokkos-kernels/pull/2097) +- Spmv handle [\#2126](https://github.com/kokkos/kokkos-kernels/pull/2126) +- Option to apply RCM reordering to extracted CRS diagonal blocks [\#2125](https://github.com/kokkos/kokkos-kernels/pull/2125) + +#### ODE updates +- Adding adaptive BDF methods [\#1930](https://github.com/kokkos/kokkos-kernels/pull/1930) + +#### Misc updates +- Add HIPManagedSpace support [\#2079](https://github.com/kokkos/kokkos-kernels/pull/2079) + +### Enhancements: + +#### BLAS +- Axpby: improvement on unification attempt logic and on the execution of a diversity of situations [\#1895](https://github.com/kokkos/kokkos-kernels/pull/1895) + +#### Misc updates +- Use execution space operator== [\#2136](https://github.com/kokkos/kokkos-kernels/pull/2136) + +#### TPL support +- Add TPL support for KokkosBlas::dot [\#1949](https://github.com/kokkos/kokkos-kernels/pull/1949) +- Add CUDA/HIP TPL support for KokkosSparse::spadd [\#1962](https://github.com/kokkos/kokkos-kernels/pull/1962) +- Don't call optimize_gemv for one-shot MKL spmv [\#2073](https://github.com/kokkos/kokkos-kernels/pull/2073) +- Async matrix release for MKL >= 2023.2 in SpMV [\#2074](https://github.com/kokkos/kokkos-kernels/pull/2074) +- BLAS - MKL: fixing HostBlas calls to handle MKL_INT type 
[\#2112](https://github.com/kokkos/kokkos-kernels/pull/2112) + +### Build System: +- Support CUBLAS_{LIBRARIES,LIBRARY_DIRS,INCLUDE_DIRS,ROOT} and KokkosKernels_CUBLAS_ROOT CMake options [\#2075](https://github.com/kokkos/kokkos-kernels/pull/2075) +- Link std::filesystem for IntelLLVM in perf_test/sparse [\#2055](https://github.com/kokkos/kokkos-kernels/pull/2055) +- Fix Cuda TPL finding [\#2098](https://github.com/kokkos/kokkos-kernels/pull/2098) +- CMake: error out in certain case [\#2115](https://github.com/kokkos/kokkos-kernels/pull/2115) + +### Documentation and Testing: +- par_ilut: Update documentation for fill_in_limit [\#2001](https://github.com/kokkos/kokkos-kernels/pull/2001) +- Wiki examples for BLAS2 functions are added [\#2122](https://github.com/kokkos/kokkos-kernels/pull/2122) +- github workflows: update to v4 (use Node 20) [\#2119](https://github.com/kokkos/kokkos-kernels/pull/2119) + +### Benchmarks: +- gemm3 perf test: user CUDA, SYCL, or HIP device for kokkos:initialize [\#2058](https://github.com/kokkos/kokkos-kernels/pull/2058) +- Lapack: adding svd benchmark [\#2103](https://github.com/kokkos/kokkos-kernels/pull/2103) +- Benchmark: modifying spmv benchmark to fix interface and run range of spmv tests [\#2135](https://github.com/kokkos/kokkos-kernels/pull/2135) + +### Cleanup: +- Experimental hip cleanup [\#1999](https://github.com/kokkos/kokkos-kernels/pull/1999) +- iostream clean-up in benchmarks [\#2004](https://github.com/kokkos/kokkos-kernels/pull/2004) +- Update: implicit capture of 'this' via '[=]' is deprecated in C++20 warnings [\#2076](https://github.com/kokkos/kokkos-kernels/pull/2076) +- Deprecate KOKKOSLINALG_OPT_LEVEL [\#2072](https://github.com/kokkos/kokkos-kernels/pull/2072) +- Remove all mentions of HBWSpace [\#2101](https://github.com/kokkos/kokkos-kernels/pull/2101) +- Change name of yaml-cpp to yamlcpp (trilinos/Trilinos#12710) [\#2099](https://github.com/kokkos/kokkos-kernels/pull/2099) +- Hands off namespace Kokkos::Impl 
- cleanup couple violations that snuck in [\#2094](https://github.com/kokkos/kokkos-kernels/pull/2094) +- Kokkos Kernels: update version guards to drop old version of Kokkos [\#2133](https://github.com/kokkos/kokkos-kernels/pull/2133) +- Sparse MKL: changing the location of the MKL_SAFE_CALL macro [\#2134](https://github.com/kokkos/kokkos-kernels/pull/2134) + +### Bug Fixes: +- Bspgemm cusparse hang [\#2008](https://github.com/kokkos/kokkos-kernels/pull/2008) +- bhalf_t fix for isnan function [\#2007](https://github.com/kokkos/kokkos-kernels/pull/2007) +- Fence Kokkos before timed iterations [\#2066](https://github.com/kokkos/kokkos-kernels/pull/2066) +- CUDA 11.2.1 / cuSPARSE 11.4.0 changed SpMV enums [\#2011](https://github.com/kokkos/kokkos-kernels/pull/2011) +- Fix the spadd API [\#2090](https://github.com/kokkos/kokkos-kernels/pull/2090) +- Axpby reduce deep copy calls [\#2081](https://github.com/kokkos/kokkos-kernels/pull/2081) +- Correcting BLAS test failures with cuda when ETI_ONLY = OFF (issue #2061) [\#2077](https://github.com/kokkos/kokkos-kernels/pull/2077) +- Fix weird Trilinos compiler error [\#2117](https://github.com/kokkos/kokkos-kernels/pull/2117) +- Fix for missing STL inclusion [\#2113](https://github.com/kokkos/kokkos-kernels/pull/2113) +- Fix build error in trsv on gcc8 [\#2111](https://github.com/kokkos/kokkos-kernels/pull/2111) +- Add a workaround for compilation errors with cuda-12.2.0 + gcc-12.3 [\#2108](https://github.com/kokkos/kokkos-kernels/pull/2108) +- Increase tolerance on gesv test (Fix #2123) [\#2124](https://github.com/kokkos/kokkos-kernels/pull/2124) +- Fix usage of RAII to set cusparse/rocsparse stream [\#2141](https://github.com/kokkos/kokkos-kernels/pull/2141) +- Spmv bsr matrix fix missing matrix descriptor (rocsparse) [\#2138](https://github.com/kokkos/kokkos-kernels/pull/2138) + ## [4.2.01](https://github.com/kokkos/kokkos-kernels/tree/4.2.01) (2024-01-17) [Full 
Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.2.00...4.2.01) diff --git a/packages/kokkos-kernels/CMakeLists.txt b/packages/kokkos-kernels/CMakeLists.txt index 4847b51e9b3a..bd3d761bdbf5 100644 --- a/packages/kokkos-kernels/CMakeLists.txt +++ b/packages/kokkos-kernels/CMakeLists.txt @@ -10,8 +10,8 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) -SET(KokkosKernels_VERSION_MINOR 2) -SET(KokkosKernels_VERSION_PATCH 1) +SET(KokkosKernels_VERSION_MINOR 3) +SET(KokkosKernels_VERSION_PATCH 0) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file @@ -127,13 +127,13 @@ ELSE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS AND NOT KOKKOSKERNELS_HAS_PARENT) # This is a standalone build FIND_PACKAGE(Kokkos REQUIRED) - IF((${Kokkos_VERSION} VERSION_EQUAL "4.1.00") OR (${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.2.00")) + IF((${Kokkos_VERSION} VERSION_GREATER_EQUAL "4.1.0") AND (${Kokkos_VERSION} VERSION_LESS_EQUAL "4.3.0")) MESSAGE(STATUS "Found Kokkos version ${Kokkos_VERSION} at ${Kokkos_DIR}") - IF((${Kokkos_VERSION} VERSION_GREATER "4.2.99")) + IF((${Kokkos_VERSION} VERSION_GREATER "4.3.99")) MESSAGE(WARNING "Configuring with Kokkos ${Kokkos_VERSION} which is newer than the expected develop branch - version check may need update") ENDIF() ELSE() - MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires 4.1.00, 4.2.00, 4.2.01 or develop") + MESSAGE(FATAL_ERROR "Kokkos Kernels ${KokkosKernels_VERSION} requires Kokkos_VERSION 4.1.0, 4.2.0, 4.2.1 or 4.3.0") ENDIF() ENDIF() @@ -156,9 +156,16 @@ ELSE() KOKKOSKERNELS_ADD_OPTION_AND_DEFINE( LINALG_OPT_LEVEL KOKKOSLINALG_OPT_LEVEL - "Optimization level for KokkosKernels computational kernels: a nonnegative integer. 
Higher levels result in better performance that is more uniform for corner cases, but increase build time and library size. The default value is 1, which should give performance within ten percent of optimal on most platforms, for most problems. Default: 1" + "DEPRECATED. Optimization level for KokkosKernels computational kernels: a nonnegative integer. Higher levels result in better performance that is more uniform for corner cases, but increase build time and library size. The default value is 1, which should give performance within ten percent of optimal on most platforms, for most problems. Default: 1" "1") + if (KokkosKernels_LINALG_OPT_LEVEL AND NOT KokkosKernels_LINALG_OPT_LEVEL STREQUAL "1") + message(WARNING "KokkosKernels_LINALG_OPT_LEVEL is deprecated!") + endif() + if(KokkosKernels_KOKKOSLINALG_OPT_LEVEL AND NOT KokkosKernels_KOKKOSLINALG_OPT_LEVEL STREQUAL "1") + message(WARNING "KokkosKernels_KOKKOSLINALG_OPT_LEVEL is deprecated!") + endif() + # Enable experimental features of KokkosKernels if set at configure # time. Default is no. 
KOKKOSKERNELS_ADD_OPTION_AND_DEFINE( @@ -375,8 +382,10 @@ ELSE() KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MKL) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUBLAS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSPARSE) + KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSOLVER) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCBLAS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSPARSE) + KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ROCSOLVER) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC METIS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ARMPL) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MAGMA) @@ -425,7 +434,7 @@ ELSE() IF (KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) IF (KokkosKernels_ENABLE_PERFTESTS) MESSAGE(STATUS "Enabling perf tests.") - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) + add_subdirectory(perf_test) # doesn't require KokkosKernels_ENABLE_TESTS=ON ENDIF () IF (KokkosKernels_ENABLE_EXAMPLES) MESSAGE(STATUS "Enabling examples.") diff --git a/packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake b/packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake index b9528ce45a24..657a9f2286ae 100644 --- a/packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake +++ b/packages/kokkos-kernels/CheckHostBlasReturnComplex.cmake @@ -21,8 +21,8 @@ FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) extern \"C\" { void F77_BLAS_MANGLE(zdotc,ZDOTC)( - std::complex* result, const int* n, - const std::complex x[], const int* incx, + std::complex* result, const int* n, + const std::complex x[], const int* incx, const std::complex y[], const int* incy); } @@ -49,9 +49,9 @@ int main() { CHECK_CXX_SOURCE_RUNS("${SOURCE}" KK_BLAS_RESULT_AS_POINTER_ARG) IF(${KK_BLAS_RESULT_AS_POINTER_ARG}) - SET(VARNAME OFF) + SET(${VARNAME} OFF PARENT_SCOPE) ELSE() - SET(VARNAME ON) + SET(${VARNAME} ON PARENT_SCOPE) ENDIF() ENDFUNCTION() diff --git a/packages/kokkos-kernels/README.md b/packages/kokkos-kernels/README.md index 0da105787037..bdad1442ce45 100644 --- a/packages/kokkos-kernels/README.md +++ 
b/packages/kokkos-kernels/README.md @@ -1,4 +1,4 @@ -[![Generic badge](https://readthedocs.org/projects/pip/badge/?version=latest&style=flat)](https://kokkos-kernels.readthedocs.io/en/latest/) +[![Generic badge](https://readthedocs.org/projects/kokkos-kernels/badge/?version=latest)](https://kokkos-kernels.readthedocs.io/en/latest/) ![KokkosKernels](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4) diff --git a/packages/kokkos-kernels/batched/KokkosBatched_Util.hpp b/packages/kokkos-kernels/batched/KokkosBatched_Util.hpp index 9078281e5955..fc14bd5a19be 100644 --- a/packages/kokkos-kernels/batched/KokkosBatched_Util.hpp +++ b/packages/kokkos-kernels/batched/KokkosBatched_Util.hpp @@ -626,18 +626,6 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, const Trans::NoTranspose) { return subview_wrapper(v, i1, i2, i3, layout_tag); } -#if KOKKOS_VERSION < 40099 -template -KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, - Kokkos::Impl::ALL_t i2, - Kokkos::Impl::ALL_t i3, - const BatchLayout::Left &layout_tag, - const Trans::Transpose) { - auto sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); - - return transpose_2d_view(sv_nt, layout_tag); -} -#else template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, @@ -647,7 +635,6 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, return transpose_2d_view(sv_nt, layout_tag); } -#endif template KOKKOS_INLINE_FUNCTION auto subview_wrapper(ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, @@ -671,16 +658,6 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( const BatchLayout::Right &layout_tag, const Trans::NoTranspose &) { return subview_wrapper(v, i1, i2, i3, layout_tag); } -#if KOKKOS_VERSION < 40099 -template -KOKKOS_INLINE_FUNCTION auto subview_wrapper( - ViewType v, IdxType1 i1, Kokkos::Impl::ALL_t i2, Kokkos::Impl::ALL_t i3, - const BatchLayout::Right &layout_tag, const Trans::Transpose &) { - auto 
sv_nt = subview_wrapper(v, i1, i3, i2, layout_tag); - - return transpose_2d_view(sv_nt, layout_tag); -} -#else template KOKKOS_INLINE_FUNCTION auto subview_wrapper( ViewType v, IdxType1 i1, Kokkos::ALL_t i2, Kokkos::ALL_t i3, @@ -689,7 +666,6 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( return transpose_2d_view(sv_nt, layout_tag); } -#endif template KOKKOS_INLINE_FUNCTION auto subview_wrapper( ViewType v, IdxType1 i1, IdxType2 i2, IdxType3 i3, diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index e4e0d5b8b75d..86d0d0873efa 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -366,20 +366,24 @@ KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, /// =========== template <> struct SerialGesv { - template + template KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const VectorType X, - const VectorType Y, + const XVectorType X, + const YVectorType Y, const MatrixType tmp) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(XVectorType::rank == 1, + "KokkosBatched::gesv: XVectorType must have rank 1."); + static_assert(YVectorType::rank == 1, + "KokkosBatched::gesv: YVectorType must have rank 1."); // Check compatibility of dimensions 
at run time. @@ -462,20 +466,24 @@ struct SerialGesv { template <> struct SerialGesv { - template + template KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const VectorType X, - const VectorType Y, + const XVectorType X, + const YVectorType Y, const MatrixType /*tmp*/) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::is_view::value, "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: XVectorType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: YVectorType is not a Kokkos::View."); static_assert(MatrixType::rank == 2, "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); + static_assert(XVectorType::rank == 1, + "KokkosBatched::gesv: XVectorType must have rank 1."); + static_assert(YVectorType::rank == 1, + "KokkosBatched::gesv: YVectorType must have rank 1."); // Check compatibility of dimensions at run time. 
diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp index f70fa6b963cd..464ea6d04a8a 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_HostLevel_Gemm_Impl.hpp @@ -93,11 +93,9 @@ int BatchedGemmImpl(BatchedGemmHandleType *const handle, const ScalarType alpha, case BaseKokkosBatchedAlgos::KK_SERIAL: case BaseHeuristicAlgos::SQUARE: case BaseTplAlgos::ARMPL: -#if KOKKOS_VERSION > 40099 assert(A.rank_dynamic() == 3 && "AViewType must have rank 3."); assert(B.rank_dynamic() == 3 && "BViewType must have rank 3."); assert(C.rank_dynamic() == 3 && "CViewType must have rank 3."); -#endif break; default: std::ostringstream os; diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index c9fd0417f63d..34c92c2d244f 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -55,11 +55,7 @@ struct SerialSVDInternal { value_type a = Kokkos::ArithTraits::one(); value_type b = -a11 - a22; value_type c = a11 * a22 - a21 * a21; -#if KOKKOS_VERSION >= 30699 using Kokkos::sqrt; -#else - using Kokkos::Experimental::sqrt; -#endif value_type sqrtDet = sqrt(b * b - 4 * a * c); e1 = (-b + sqrtDet) / (2 * a); e2 = (-b - sqrtDet) / (2 * a); diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp index 268df195ce76..4d094c24d254 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Serial_Impl.hpp @@ -176,6 
+176,32 @@ struct SerialTrsm +struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke( + ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); + } +}; + +template +struct SerialTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return SerialTrsmInternalLeftLower::invoke( + ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, A.data(), + A.stride_0(), A.stride_1(), B.data(), B.stride_1(), B.stride_0()); + } +}; + /// /// L/U/NT /// diff --git a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp index 41fe47a35ec8..a7430775ea4e 100644 --- a/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp +++ b/packages/kokkos-kernels/batched/dense/impl/KokkosBatched_Trsm_Team_Impl.hpp @@ -99,6 +99,36 @@ struct TeamTrsm +struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftLower::invoke( + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0()); + } +}; + +template +struct TeamTrsm { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A, + const BViewType &B) { + return TeamTrsmInternalLeftLower::invoke( + member, ArgDiag::use_unit_diag, B.extent(1), B.extent(0), alpha, + A.data(), A.stride_0(), A.stride_1(), B.data(), B.stride_1(), + B.stride_0()); + } +}; + /// /// L/U/NT /// diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gesv.hpp 
b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gesv.hpp index 3abedfd0aa9e..c4821db4597f 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gesv.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Gesv.hpp @@ -63,11 +63,18 @@ struct Gesv { template struct SerialGesv { - template + template KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, - const VectorType X, - const VectorType Y, + const XVectorType X, + const YVectorType Y, const MatrixType tmp); + + template + [[deprecated]] KOKKOS_INLINE_FUNCTION static int invoke( + const MatrixType A, const VectorType X, const VectorType Y, + const MatrixType tmp) { + return invoke(A, X, Y, tmp); + } }; /// \brief Team Batched GESV: diff --git a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector_SIMD.hpp b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector_SIMD.hpp index e27419e7c2c0..753904dbb9e0 100644 --- a/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector_SIMD.hpp +++ b/packages/kokkos-kernels/batched/dense/src/KokkosBatched_Vector_SIMD.hpp @@ -513,6 +513,11 @@ class Vector, 4> { #if defined(__KOKKOSBATCHED_ENABLE_AVX__) #if defined(__AVX__) || defined(__AVX2__) + +#if CUDA_VERSION < 12022 +#undef _Float16 +#endif + #include namespace KokkosBatched { @@ -668,6 +673,9 @@ class Vector >, 2> { #endif /* #if defined(__AVX__) || defined(__AVX2__) */ #if defined(__AVX512F__) +#if CUDA_VERSION < 12022 +#undef _Float16 +#endif #include namespace KokkosBatched { diff --git a/packages/kokkos-kernels/blas/CMakeLists.txt b/packages/kokkos-kernels/blas/CMakeLists.txt index 869b152e7b72..5bc7217cfdae 100644 --- a/packages/kokkos-kernels/blas/CMakeLists.txt +++ b/packages/kokkos-kernels/blas/CMakeLists.txt @@ -297,6 +297,13 @@ KOKKOSKERNELS_GENERATE_ETI(Blas2_syr syr TYPE_LISTS FLOATS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Blas2_syr2 syr2 + COMPONENTS blas + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES 
+) + KOKKOSKERNELS_GENERATE_ETI(Blas3_gemm gemm COMPONENTS blas HEADER_LIST ETI_HEADERS diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp b/packages/kokkos-kernels/blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in similarity index 73% rename from packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp rename to packages/kokkos-kernels/blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in index a77a55ea6530..669b5fd1aa37 100644 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_cpp/syr2/KokkosBlas2_syr2_eti_spec_inst.cpp.in @@ -14,5 +14,12 @@ // //@HEADER -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -#include +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosBlas2_syr2_spec.hpp" + +namespace KokkosBlas { +namespace Impl { +@BLAS2_SYR2_ETI_INST_BLOCK@ +} //IMPL +} //Kokkos diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in similarity index 76% rename from packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp rename to packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in index 8f5ad83ed7b1..9e7a01653e85 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/eti/generated_specializations_hpp/KokkosBlas2_syr2_eti_spec_avail.hpp.in @@ -14,11 +14,12 @@ // //@HEADER -#ifndef KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ -#define KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ - -namespace KokkosSparse { -namespace Impl {} -} // namespace KokkosSparse +#ifndef KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL_HPP_ +namespace KokkosBlas { 
+namespace Impl { +@BLAS2_SYR2_ETI_AVAIL_BLOCK@ +} //IMPL +} //Kokkos #endif diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_impl.hpp index 4e468b0e56ff..b919d76a9479 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_impl.hpp @@ -19,14 +19,23 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" #include "Kokkos_InnerProductSpaceTraits.hpp" - -#ifndef KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY -#define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY 2 -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY +#include "KokkosKernels_Error.hpp" namespace KokkosBlas { namespace Impl { +template +constexpr typename std::enable_if, int>::type +axpbyVarExtent(T& v) { + return v.extent(0); +} + +template +constexpr typename std::enable_if, int>::type +axpbyVarExtent(T&) { + return 0; +} + // // axpby // @@ -44,8 +53,8 @@ namespace Impl { // // The template parameters scalar_x and scalar_y correspond to alpha // resp. beta in the operation y = alpha*x + beta*y. The values -1, -// 0, and -1 correspond to literal values of those coefficients. The -// value 2 tells the functor to use the corresponding vector of +// 0, and -1 correspond to literal values of those coefficients. +// The value 2 tells the functor to use the corresponding vector of // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. 
@@ -61,32 +70,39 @@ struct Axpby_Functor { AV m_a; BV m_b; - Axpby_Functor(const XV& x, const YV& y, const AV& a, const BV& b, + Axpby_Functor(const XV& x, const YV& y, const AV& av, const BV& bv, const SizeType startingColumn) - : m_x(x), m_y(y), m_a(a), m_b(b) { + : m_x(x), m_y(y), m_a(av), m_b(bv) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_Functor: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YV::rank == (int)XV::rank, - "KokkosBlas::Impl::" - "Axpby_Functor: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": X and Y must have the same rank."); static_assert(YV::rank == 1, - "KokkosBlas::Impl::Axpby_Functor: " - "XV and YV must have rank 1."); - + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": XV and YV must have rank 1."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_Functor(ABgeneric)" + ": scalar_x and/or scalar_y are out of range."); if (startingColumn != 0) { - m_a = Kokkos::subview( - a, std::make_pair(startingColumn, SizeType(a.extent(0)))); - m_b = Kokkos::subview( - b, std::make_pair(startingColumn, SizeType(b.extent(0)))); + if (axpbyVarExtent(m_a) > 1) { + m_a = Kokkos::subview( + av, std::make_pair(startingColumn, SizeType(av.extent(0)))); + } + if (axpbyVarExtent(m_b) > 1) { + m_b = 
Kokkos::subview( + bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); + } } } @@ -96,73 +112,83 @@ struct Axpby_Functor { // are template parameters), so the compiler should evaluate these // branches at compile time. -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY <= 2 - - if (scalar_x == 0 && scalar_y == 0) { - m_y(i) = ATS::zero(); - } - if (scalar_x == 0 && scalar_y == 2) { - m_y(i) = m_b(0) * m_y(i); - } - if (scalar_x == 2 && scalar_y == 0) { - m_y(i) = m_a(0) * m_x(i); - } - if (scalar_x == 2 && scalar_y == 2) { - m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); - } - -#else // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - if (scalar_x == 0 && scalar_y == 0) { - m_y(i) = ATS::zero(); - } - if (scalar_x == 0 && scalar_y == -1) { - m_y(i) = -m_y(i); - } - if (scalar_x == 0 && scalar_y == 1) { - return; // m_y(i) = m_y(i); - } - if (scalar_x == 0 && scalar_y == 2) { - m_y(i) = m_b(0) * m_y(i); - } - if (scalar_x == -1 && scalar_y == 0) { - m_y(i) = -m_x(i); - } - if (scalar_x == -1 && scalar_y == -1) { - m_y(i) = -m_x(i) - m_y(i); - } - if (scalar_x == -1 && scalar_y == 1) { - m_y(i) = -m_x(i) + m_y(i); - } - if (scalar_x == -1 && scalar_y == 2) { - m_y(i) = -m_x(i) + m_b(0) * m_y(i); + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if constexpr (scalar_x == 0) { + if constexpr (scalar_y == 0) { + m_y(i) = ATS::zero(); + } else if constexpr (scalar_y == -1) { + m_y(i) = -m_y(i); + } else if constexpr (scalar_y == 1) { + // Nothing to do: m_y(i) = m_y(i); + } else if constexpr (scalar_y == 2) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { + m_y(i) = + Kokkos::ArithTraits::zero(); + } else { + m_y(i) = m_b(0) * m_y(i); + } + } + } + // ************************************************************** + // Possibilities with 'scalar_x == -1' + // ************************************************************** + else if constexpr (scalar_x == 
-1) { + if constexpr (scalar_y == 0) { + m_y(i) = -m_x(i); + } else if constexpr (scalar_y == -1) { + m_y(i) = -m_x(i) - m_y(i); + } else if constexpr (scalar_y == 1) { + m_y(i) = -m_x(i) + m_y(i); + } else if constexpr (scalar_y == 2) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { + m_y(i) = -m_x(i); + } else { + m_y(i) = -m_x(i) + m_b(0) * m_y(i); + } + } + } + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if constexpr (scalar_x == 1) { + if constexpr (scalar_y == 0) { + m_y(i) = m_x(i); + } else if constexpr (scalar_y == -1) { + m_y(i) = m_x(i) - m_y(i); + } else if constexpr (scalar_y == 1) { + m_y(i) = m_x(i) + m_y(i); + } else if constexpr (scalar_y == 2) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { + m_y(i) = m_x(i); + } else { + m_y(i) = m_x(i) + m_b(0) * m_y(i); + } + } + } + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else if constexpr (scalar_x == 2) { + if constexpr (scalar_y == 0) { + m_y(i) = m_a(0) * m_x(i); + } else if constexpr (scalar_y == -1) { + m_y(i) = m_a(0) * m_x(i) - m_y(i); + } else if constexpr (scalar_y == 1) { + m_y(i) = m_a(0) * m_x(i) + m_y(i); + } else if constexpr (scalar_y == 2) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { + m_y(i) = m_a(0) * m_x(i); + } else { + m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); + } + } } - if (scalar_x == 1 && scalar_y == 0) { - m_y(i) = m_x(i); - } - if (scalar_x == 1 && scalar_y == -1) { - m_y(i) = m_x(i) - m_y(i); - } - if (scalar_x == 1 && scalar_y == 1) { - m_y(i) = m_x(i) + m_y(i); - } - if (scalar_x == 1 && scalar_y == 2) { - m_y(i) = m_x(i) + m_b(0) * m_y(i); - } - if (scalar_x == 2 && scalar_y == 0) { - m_y(i) = m_a(0) * m_x(i); - } - if (scalar_x == 2 && scalar_y == -1) { - m_y(i) = m_a(0) * m_x(i) - m_y(i); - } - 
if (scalar_x == 2 && scalar_y == 1) { - m_y(i) = m_a(0) * m_x(i) + m_y(i); - } - if (scalar_x == 2 && scalar_y == 2) { - m_y(i) = m_a(0) * m_x(i) + m_b(0) * m_y(i); - } - -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY } }; @@ -177,8 +203,8 @@ struct Axpby_Functor { // // The template parameters scalar_x and scalar_y correspond to alpha // resp. beta in the operation y = alpha*x + beta*y. The values -1, -// 0, and -1 correspond to literal values of those coefficients. The -// value 2 tells the functor to use the corresponding vector of +// 0, and -1 correspond to literal values of those coefficients. +// The value 2 tells the functor to use the corresponding vector of // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. @@ -201,22 +227,26 @@ struct Axpby_Functor::value, - "KokkosBlas::Impl::" - "Axpby_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_Functor: R is const. 
" - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YV::rank == (int)XV::rank, - "KokkosBlas::Impl::" - "Axpby_Functor: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": X and Y must have the same rank."); static_assert(YV::rank == 1, - "KokkosBlas::Impl::Axpby_Functor: " + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" "XV and YV must have rank 1."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_Functor(ABscalars)" + ": scalar_x and/or scalar_y are out of range."); } KOKKOS_INLINE_FUNCTION @@ -225,80 +255,69 @@ struct Axpby_Functor(ATS::zero()); - } - if (scalar_x == 0 && scalar_y == 2) { - m_y(i) = static_cast(m_b * m_y(i)); - } - if (scalar_x == 2 && scalar_y == 0) { - m_y(i) = static_cast(m_a * m_x(i)); - } - if (scalar_x == 2 && scalar_y == 2) { - m_y(i) = static_cast(m_a * m_x(i) + - m_b * m_y(i)); - } - -#else // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - if (scalar_x == 0 && scalar_y == 0) { - m_y(i) = ATS::zero(); - } - if (scalar_x == 0 && scalar_y == -1) { - m_y(i) = -m_y(i); - } - if (scalar_x == 0 && scalar_y == 1) { - return; // m_y(i) = m_y(i); - } - if (scalar_x == 0 && scalar_y == 2) { - m_y(i) = m_b * m_y(i); - } - if (scalar_x == -1 && scalar_y == 0) { - m_y(i) = -m_x(i); - } - if (scalar_x == -1 && scalar_y == -1) { - m_y(i) = -m_x(i) - m_y(i); + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if constexpr (scalar_x == 0) { + if constexpr (scalar_y == 0) { + m_y(i) = ATS::zero(); + } else if constexpr (scalar_y == -1) { + m_y(i) = -m_y(i); + } else if constexpr (scalar_y == 
1) { + // Nothing to do: m_y(i) = m_y(i); + } else if constexpr (scalar_y == 2) { + m_y(i) = m_b * m_y(i); + } + } + // ************************************************************** + // Possibilities with 'scalar_x == -1' + // ************************************************************** + else if constexpr (scalar_x == -1) { + if constexpr (scalar_y == 0) { + m_y(i) = -m_x(i); + } else if constexpr (scalar_y == -1) { + m_y(i) = -m_x(i) - m_y(i); + } else if constexpr (scalar_y == 1) { + m_y(i) = -m_x(i) + m_y(i); + } else if constexpr (scalar_y == 2) { + m_y(i) = -m_x(i) + m_b * m_y(i); + } + } + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if constexpr (scalar_x == 1) { + if constexpr (scalar_y == 0) { + m_y(i) = m_x(i); + } else if constexpr (scalar_y == -1) { + m_y(i) = m_x(i) - m_y(i); + } else if constexpr (scalar_y == 1) { + m_y(i) = m_x(i) + m_y(i); + } else if constexpr (scalar_y == 2) { + m_y(i) = m_x(i) + m_b * m_y(i); + } + } + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else if constexpr (scalar_x == 2) { + if constexpr (scalar_y == 0) { + m_y(i) = m_a * m_x(i); + } else if constexpr (scalar_y == -1) { + m_y(i) = m_a * m_x(i) - m_y(i); + } else if constexpr (scalar_y == 1) { + m_y(i) = m_a * m_x(i) + m_y(i); + } else if constexpr (scalar_y == 2) { + m_y(i) = m_a * m_x(i) + m_b * m_y(i); + } } - if (scalar_x == -1 && scalar_y == 1) { - m_y(i) = -m_x(i) + m_y(i); - } - if (scalar_x == -1 && scalar_y == 2) { - m_y(i) = -m_x(i) + m_b * m_y(i); - } - if (scalar_x == 1 && scalar_y == 0) { - m_y(i) = m_x(i); - } - if (scalar_x == 1 && scalar_y == -1) { - m_y(i) = m_x(i) - m_y(i); - } - if (scalar_x == 1 && scalar_y == 1) { - m_y(i) = m_x(i) + m_y(i); - } - if (scalar_x == 1 && scalar_y == 2) { 
- m_y(i) = m_x(i) + m_b * m_y(i); - } - if (scalar_x == 2 && scalar_y == 0) { - m_y(i) = m_a * m_x(i); - } - if (scalar_x == 2 && scalar_y == -1) { - m_y(i) = m_a * m_x(i) - m_y(i); - } - if (scalar_x == 2 && scalar_y == 1) { - m_y(i) = m_a * m_x(i) + m_y(i); - } - if (scalar_x == 2 && scalar_y == 2) { - m_y(i) = m_a * m_x(i) + m_b * m_y(i); - } - -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY } }; // Variant of Axpby_MV_Generic for single vectors (1-D Views) x and y. -// As above, either av and bv are both 1-D Views (and only the first -// entry of each will be read), or both av and bv are scalars. +// As above, av and bv are either: +// - both 1-D views (and only the first entry of each are read), or +// - both scalars. // // This takes the starting column, so that if av and bv are both 1-D // Views, then the functor can take a subview if appropriate. @@ -306,7 +325,7 @@ template void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, const BV& bv, const YV& y, const SizeType startingColumn, - int a = 2, int b = 2) { + int scalar_x = 2, int scalar_y = 2) { static_assert(Kokkos::is_view::value, "KokkosBlas::Impl::" "Axpby_Generic: X is not a Kokkos::View."); @@ -325,118 +344,106 @@ void Axpby_Generic(const execution_space& space, const AV& av, const XV& x, "KokkosBlas::Impl::Axpby_Generic: " "XV and YV must have rank 1."); - const SizeType numRows = x.extent(0); - Kokkos::RangePolicy policy(space, 0, numRows); - - if (a == 0 && b == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S0", policy, op); - return; - } - -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 0 && b == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S1", policy, op); - return; + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_Generic()" + 
": scalar_x and/or scalar_y are out of range."); } - if (a == 0 && b == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S2", policy, op); - return; - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 0 && b == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S3", policy, op); - return; - } + const SizeType numRows = x.extent(0); + Kokkos::RangePolicy policy(space, 0, numRows); -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - // a == -1 - if (a == -1 && b == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S4", policy, op); - return; - } - if (a == -1 && b == -1) { - Axpby_Functor op(x, y, av, bv, + // **************************************************************** + // Possibilities with 'scalar_x == 0' + // **************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { + Axpby_Functor op(x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S5", policy, op); - return; - } - if (a == -1 && b == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S6", policy, op); - return; - } - if (a == -1 && b == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S7", policy, op); - return; - } - // a == 1 - if (a == 1 && b == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S8", policy, op); - return; - } - if (a == 1 && b == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S9", policy, op); - return; - } - if (a == 1 && b == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S10", policy, op); - return; - } - if (a == 1 && b == 2) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - 
Kokkos::parallel_for("KokkosBlas::Axpby::S11", policy, op); - return; + Kokkos::parallel_for("KokkosBlas::Axpby::S0", policy, op); + } else if (scalar_y == -1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S1", policy, op); + } else if (scalar_y == 1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S2", policy, op); + } else if (scalar_y == 2) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S3", policy, op); + } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a == 2 - if (a == 2 && b == 0) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S12", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == -1' + // **************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S4", policy, op); + } else if (scalar_y == -1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S5", policy, op); + } else if (scalar_y == 1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S6", policy, op); + } else if (scalar_y == 2) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S7", policy, op); + } } - -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 2 && b == -1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S13", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 1' + // **************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { + 
Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S8", policy, op); + } else if (scalar_y == -1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S9", policy, op); + } else if (scalar_y == 1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S10", policy, op); + } else if (scalar_y == 2) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S11", policy, op); + } } - if (a == 2 && b == 1) { - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S14", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 2' + // **************************************************************** + else if (scalar_x == 2) { + if (scalar_y == 0) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S12", policy, op); + } else if (scalar_y == -1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S13", policy, op); + } else if (scalar_y == 1) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S14", policy, op); + } else if (scalar_y == 2) { + Axpby_Functor op(x, y, av, bv, + startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::S15", policy, op); + } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a and b arbitrary (not -1, 0, or 1) - Axpby_Functor op(x, y, av, bv, - startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::S15", policy, op); } } // namespace Impl diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_mv_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_mv_impl.hpp index 32653b9cce1d..7db7b0abe3c3 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_mv_impl.hpp +++ 
b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_mv_impl.hpp @@ -35,8 +35,8 @@ namespace Impl { // // The template parameters scalar_x and scalar_y correspond to alpha // resp. beta in the operation y = alpha*x + beta*y. The values -1, -// 0, and -1 correspond to literal values of those coefficients. The -// value 2 tells the functor to use the corresponding vector of +// 0, and -1 correspond to literal values of those coefficients. +// The value 2 tells the functor to use the corresponding vector of // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. @@ -52,39 +52,41 @@ struct Axpby_MV_Functor { AV m_a; BV m_b; - Axpby_MV_Functor(const XMV& X, const YMV& Y, const AV& a, const BV& b) - : numCols(X.extent(1)), m_x(X), m_y(Y), m_a(a), m_b(b) { - // XMV and YMV must be Kokkos::View specializations. + Axpby_MV_Functor(const XMV& X, const YMV& Y, const AV& av, const BV& bv) + : numCols(X.extent(1)), m_x(X), m_y(Y), m_a(av), m_b(bv) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: a is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": 'a' is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: b is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": 'b' is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: Y is not a Kokkos::View."); - // YMV must be nonconst (else it can't be an output argument). 
+ "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Functor: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": XMV and YMV must have rank 2."); static_assert(AV::rank == 1, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "AV must have rank 1."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": AV must have rank 1."); static_assert(BV::rank == 1, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "BV must have rank 1."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": BV must have rank 1."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_MV_Functor(ABgeneric)" + ": scalar_x and/or scalar_y are out of range."); } KOKKOS_INLINE_FUNCTION @@ -92,175 +94,358 @@ struct Axpby_MV_Functor { // scalar_x and scalar_y are compile-time constants (since they // are template parameters), so the compiler should evaluate these // branches at compile time. 
- if (scalar_x == 0 && scalar_y == 0) { + + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if constexpr (scalar_x == 0) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = ATS::zero(); - } - } - if (scalar_x == 0 && scalar_y == -1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = ATS::zero(); + } + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_y(i, k); - } - } - if (scalar_x == 0 && scalar_y == 1) { - return; // Y(i,j) := Y(i,j) - } - if (scalar_x == 0 && scalar_y == 2) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_y(i, k); + } + } else if constexpr (scalar_y == 1) { + // Nothing to do: Y(i,j) := Y(i,j) + } else if constexpr (scalar_y == 2) { + if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_b(k) * m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 0) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = Kokkos::ArithTraits< + typename YMV::non_const_value_type>::zero(); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_x(i, k); - } - } - if (scalar_x == -1 && scalar_y == -1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_b(0) * m_y(i, k); + } + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif 
#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_x(i, k) - m_y(i, k); + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == -1 && scalar_y == 1) { + // ************************************************************** + // Possibilities with 'scalar_x == -1' + // ************************************************************** + else if constexpr (scalar_x == -1) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 2) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k); + } + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = -m_x(i, k) + m_b(k) * m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 0) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k) - m_y(i, k); + } + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_x(i, k); - } - } - if (scalar_x == 1 && scalar_y == -1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k) + m_y(i, k); + } + } else if constexpr (scalar_y == 2) { + if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 1) { + for (size_type k = 0; k < numCols; 
++k) { + m_y(i, k) = -m_x(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 2) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k) + m_b(0) * m_y(i, k); + } + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_x(i, k) + m_b(k) * m_y(i, k); + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = -m_x(i, k) + m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == 2 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if constexpr (scalar_x == 1) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k); - } - } - if (scalar_x == 2 && scalar_y == -1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k); + } + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 1) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k) - m_y(i, k); + } + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 
2) { + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k) + m_y(i, k); + } + } else if constexpr (scalar_y == 2) { + if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); + } + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif #ifdef KOKKOS_ENABLE_PRAGMA_VECTOR #pragma vector always #endif - for (size_type k = 0; k < numCols; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_x(i, k) + m_b(k) * m_y(i, k); + } + } } } - } + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else if constexpr (scalar_x == 2) { + if constexpr (scalar_y == 0) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k); + } + } + } else if constexpr (scalar_y == -1) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) - m_y(i, k); + } + } else { +#ifdef 
KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); + } + } + } else if constexpr (scalar_y == 1) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); + } + } + } else if constexpr (scalar_y == 2) { + if (m_a.extent(0) == 1) { + if (m_b.extent(0) == 1) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } else { + if (m_b.extent(0) == 1) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma 
ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP +#pragma ivdep +#endif +#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR +#pragma vector always +#endif + for (size_type k = 0; k < numCols; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } + } // if constexpr (scalar_y == ...) else if + } // if constexpr (scalar_x == ...) else if + } // void operator() }; // Variant of Axpby_MV_Functor, where a and b are scalars. @@ -268,13 +453,13 @@ struct Axpby_MV_Functor { // // 1. Y(i,j) = alpha*X(i,j) + beta*Y(i,j) for alpha,beta in -1,0,1 // 2. Y(i,j) = a*X(i,j) + beta*Y(i,j) for beta in -1,0,1 -// 3. Y(i,j) = alpha*X(i,j) + beta*Y(i,j) for alpha in -1,0,1 +// 3. Y(i,j) = alpha*X(i,j) + b*Y(i,j) for alpha in -1,0,1 // 4. Y(i,j) = a*X(i,j) + b*Y(i,j) // // The template parameters scalar_x and scalar_y correspond to alpha // resp. beta in the operation y = alpha*x + beta*y. The values -1, -// 0, and -1 correspond to literal values of those coefficients. The -// value 2 tells the functor to use the corresponding vector of +// 0, and -1 correspond to literal values of those coefficients. +// The value 2 tells the functor to use the corresponding vector of // coefficients. Any literal coefficient of zero has BLAS semantics // of ignoring the corresponding (multi)vector entry. This does not // apply to coefficients in the a and b vectors, if they are used. 
@@ -299,22 +484,26 @@ struct Axpby_MV_Functor::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Functor: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Functor: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Functor: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": XMV and YMV must have rank 2."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_MV_Functor(ABscalars)" + ": scalar_x and/or scalar_y are out of range."); } KOKKOS_INLINE_FUNCTION @@ -322,175 +511,184 @@ struct Axpby_MV_Functor::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: a is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": 'a' is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: b is not a Kokkos::View."); + 
"KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": 'b' is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": XMV and YMV must have rank 2."); static_assert(AV::rank == 1, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "AV must have rank 1."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": AV must have rank 1."); static_assert(BV::rank == 1, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "BV must have rank 1."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": BV must have rank 1."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABgeneric)" + ": scalar_x and/or scalar_y are out of range."); if (startingColumn != 0) { - m_a = Kokkos::subview( - a, std::make_pair(startingColumn, SizeType(a.extent(0)))); - m_b = Kokkos::subview( - b, std::make_pair(startingColumn, SizeType(b.extent(0)))); + if (axpbyVarExtent(m_a) > 1) { + m_a = Kokkos::subview( + av, std::make_pair(startingColumn, 
SizeType(av.extent(0)))); + } + if (axpbyVarExtent(m_b) > 1) { + m_b = Kokkos::subview( + bv, std::make_pair(startingColumn, SizeType(bv.extent(0)))); + } } } @@ -553,167 +759,269 @@ struct Axpby_MV_Unroll_Functor { // are template parameters), so the compiler should evaluate these // branches at compile time. -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY <= 2 - - if (scalar_x == 0 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 0' + // ************************************************************** + if constexpr (scalar_x == 0) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = ATS::zero(); - } - } - if (scalar_x == 0 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = ATS::zero(); + } + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_b(k) * m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 0) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_y(i, k); + } + } else if constexpr (scalar_y == 1) { + // Nothing to do: Y(i,j) := Y(i,j) + } else if constexpr (scalar_y == 2) { + if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k); - } - } - if (scalar_x == 2 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = Kokkos::ArithTraits< + typename YMV::non_const_value_type>::zero(); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); - } - } - -#else // KOKKOSBLAS_OPTIMIZATION_LEVEL >= 3 - - if (scalar_x == 0 && scalar_y == 0) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_b(0) * m_y(i, k); + } + } + } 
else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = ATS::zero(); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == 0 && scalar_y == -1) { + // ************************************************************** + // Possibilities with 'scalar_x == -1' + // ************************************************************** + else if constexpr (scalar_x == -1) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_y(i, k); - } - } - if (scalar_x == 0 && scalar_y == 1) { - return; // Y(i,j) := Y(i,j) - } - if (scalar_x == 0 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k); + } + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_b(k) * m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 0) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) - m_y(i, k); + } + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k); - } - } - if (scalar_x == -1 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + m_y(i, k); + } + } else if constexpr (scalar_y == 2) { + if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, 
k) = -m_x(i, k) + m_b(0) * m_y(i, k); + } + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) + m_b(k) * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == 1 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if constexpr (scalar_x == 1) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k); - } - } - if (scalar_x == 1 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k); + } + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) - m_y(i, k); + } + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_y(i, k); + } + } else if constexpr (scalar_y == 2) { + if (m_b.extent(0) == 1) { + if (m_b(0) == + Kokkos::ArithTraits::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) + m_b(k) * m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 0) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k); + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k); - } - } - if (scalar_x == 2 && scalar_y == -1) { + for (int k = 0; k < UNROLL; 
++k) { + m_y(i, k) = m_x(i, k) + m_b(0) * m_y(i, k); + } + } + } else { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_b(k) * m_y(i, k); + } + } } } - if (scalar_x == 2 && scalar_y == 1) { + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else if constexpr (scalar_x == 2) { + if constexpr (scalar_y == 0) { + if (m_a.extent(0) == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k); + } + } + } else if constexpr (scalar_y == -1) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) - m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) - m_y(i, k); + } + } + } else if constexpr (scalar_y == 1) { + if (m_a.extent(0) == 1) { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_y(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_y(i, k); + } + } + } else if constexpr (scalar_y == 2) { + if (m_a.extent(0) == 1) { + if (m_b.extent(0) == 1) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { +#ifdef 
KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(0) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } else { + if (m_b.extent(0) == 1) { + if (m_b(0) == Kokkos::ArithTraits< + typename BV::non_const_value_type>::zero()) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k); + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(0) * m_y(i, k); + } + } + } else { +#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL +#pragma unroll +#endif + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a(k) * m_x(i, k) + m_b(k) * m_y(i, k); + } + } + } } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY } }; @@ -739,22 +1047,26 @@ struct Axpby_MV_Unroll_Functor::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: Y is const. 
" - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Unroll_Functor: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Unroll_Functor: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": XMV and YMV must have rank 2."); + static_assert((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2), + "KokkosBlas::Impl::Axpby_MV_Unroll_Functor(ABscalars)" + ": scalar_x and/or scalar_y are out of range."); } KOKKOS_INLINE_FUNCTION @@ -763,168 +1075,137 @@ struct Axpby_MV_Unroll_Functor 2 - - if (scalar_x == 0 && scalar_y == 0) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k); + } + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = ATS::zero(); - } - } - if (scalar_x == 0 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) - m_y(i, k); + } + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_y(i, k); - } - } - if (scalar_x == 0 && scalar_y == 1) { - return; // Y(i,j) := Y(i,j) - } - if (scalar_x == 0 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + m_y(i, k); + } + } else if constexpr (scalar_y == 2) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_b * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = -m_x(i, k) + 
m_b * m_y(i, k); + } } } - if (scalar_x == -1 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 1' + // ************************************************************** + else if constexpr (scalar_x == 1) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k); - } - } - if (scalar_x == -1 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k); + } + } else if constexpr (scalar_y == -1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) - m_y(i, k); + } + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == -1 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_y(i, k); + } + } else if constexpr (scalar_y == 2) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = -m_x(i, k) + m_b * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_x(i, k) + m_b * m_y(i, k); + } } } - if (scalar_x == 1 && scalar_y == 0) { + // ************************************************************** + // Possibilities with 'scalar_x == 2' + // ************************************************************** + else if constexpr (scalar_x == 2) { + if constexpr (scalar_y == 0) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k); - } - } - if (scalar_x == 1 && scalar_y == -1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a * m_x(i, k); + } + } else if constexpr (scalar_y == -1) { #ifdef 
KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 1) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 1 && scalar_y == 2) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_x(i, k) + m_b * m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 0) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a * m_x(i, k); - } - } - if (scalar_x == 2 && scalar_y == -1) { -#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL -#pragma unroll -#endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a * m_x(i, k) - m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 1) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a * m_x(i, k) - m_y(i, k); + } + } else if constexpr (scalar_y == 1) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a * m_x(i, k) + m_y(i, k); - } - } - if (scalar_x == 2 && scalar_y == 2) { + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a * m_x(i, k) + m_y(i, k); + } + } else if constexpr (scalar_y == 2) { #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL #pragma unroll #endif - for (int k = 0; k < UNROLL; ++k) { - m_y(i, k) = m_a * m_x(i, k) + m_b * m_y(i, k); + for (int k = 0; k < UNROLL; ++k) { + m_y(i, k) = m_a * m_x(i, k) + m_b * m_y(i, k); + } } } - -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY } }; @@ -936,11 +1217,11 @@ struct Axpby_MV_Unroll_Functor void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, const BV& bv, const YMV& y, - const SizeType startingColumn, int a = 2, int b = 2) { + const SizeType startingColumn, int scalar_x = 2, + int scalar_y = 2) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - 
"Axpby_MV_Unrolled: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Unrolled: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Unrolled: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Unrolled: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Unrolled: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": XMV and YMV must have rank 2."); + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_MV_Unrolled()" + ": scalar_x and/or scalar_y are out of range."); + } const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); - if (a == 0 && b == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S0", policy, op); - return; - } - -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 0 && b == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S1", policy, op); - return; - } - if (a == 0 && b == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S2", policy, op); - return; - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY - 
- if (a == 0 && b == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S3", policy, op); - return; - } - -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - // a == -1 - if (a == -1 && b == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S4", policy, op); - return; - } - if (a == -1 && b == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S5", policy, op); - return; - } - if (a == -1 && b == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S6", policy, op); - return; - } - if (a == -1 && b == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S7", policy, op); - return; - } - // a == 1 - if (a == 1 && b == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S8", policy, op); - return; - } - if (a == 1 && b == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S9", policy, op); - return; - } - if (a == 1 && b == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S10", policy, op); - return; - } - if (a == 1 && b == 2) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S11", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 0' + // **************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S0", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Unroll_Functor 
op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S1", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S2", policy, op); + } else if (scalar_y == 2) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S3", policy, op); + } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a == 2 - if (a == 2 && b == 0) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S12", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == -1' + // **************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S4", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S5", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S6", policy, op); + } else if (scalar_y == 2) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S7", policy, op); + } } - -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 2 && b == -1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S13", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 1' + // **************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, 
startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S8", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S9", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S10", policy, op); + } else if (scalar_y == 2) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S11", policy, op); + } } - if (a == 2 && b == 1) { - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S14", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 2' + // **************************************************************** + else if (scalar_x == 2) { + if (scalar_y == 0) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S12", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S13", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S14", policy, op); + } else if (scalar_y == 2) { + Axpby_MV_Unroll_Functor op( + x, y, av, bv, startingColumn); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S15", policy, op); + } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a and b arbitrary (not -1, 0, or 1) - Axpby_MV_Unroll_Functor op( - x, y, av, bv, startingColumn); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S15", policy, op); } // Invoke the "generic" (not unrolled) multivector functor that @@ -1092,11 +1361,11 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, // 3. 
Y(i,j) = a*X(i,j) + b*Y(i,j) for a in -1,0,1 // 4. Y(i,j) = av(j)*X(i,j) + bv(j)*Y(i,j) // -// a and b come in as integers. The values -1, 0, and 1 correspond to -// the literal values of the coefficients. The value 2 tells the -// functor to use the corresponding vector of coefficients: a == 2 -// means use av, and b == 2 means use bv. Otherwise, av resp. vb are -// ignored. +// scalar_x and scalar_y come in as integers. The values -1, 0, and 1 +// correspond to the literal values of the coefficients. The value 2 +// tells the functor to use the corresponding vector of coefficients: +// - scalar_x == 2 means use av, otherwise ignore av; +// - scalar_y == 2 means use bv, otherwise ignore bv. // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. This does NOT apply to @@ -1106,121 +1375,109 @@ void Axpby_MV_Unrolled(const execution_space& space, const AV& av, const XMV& x, template void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int a = 2, int b = 2) { + const BV& bv, const YMV& y, int scalar_x = 2, + int scalar_y = 2) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Generic: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Generic: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Generic: Y is const. 
" - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Generic: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Generic: " - "XMV and YMV must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": XMV and YMV must have rank 2."); + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_MV_Generic()" + ": scalar_x and/or scalar_y are out of range."); + } const SizeType numRows = x.extent(0); Kokkos::RangePolicy policy(space, 0, numRows); - if (a == 0 && b == 0) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S16", policy, op); - return; - } - -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 0 && b == -1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S17", policy, op); - return; - } - if (a == 0 && b == 1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S18", policy, op); - return; - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - if (a == 0 && b == 2) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S19", policy, op); - return; - } - -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - // a == -1 - if (a == -1 && b == 0) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S20", policy, op); - return; - } - if (a == -1 && b == -1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S21", policy, 
op); - return; - } - if (a == -1 && b == 1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S22", policy, op); - return; - } - if (a == -1 && b == 2) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S23", policy, op); - return; - } - // a == 1 - if (a == 1 && b == 0) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S24", policy, op); - return; - } - if (a == 1 && b == -1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S25", policy, op); - return; - } - if (a == 1 && b == 1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S26", policy, op); - return; - } - if (a == 1 && b == 2) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S27", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 0' + // **************************************************************** + if (scalar_x == 0) { + if (scalar_y == 0) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S16", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S17", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S18", policy, op); + } else if (scalar_y == 2) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S19", policy, op); + } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a == 2 - if (a == 2 && b == 0) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S28", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == -1' + // 
**************************************************************** + else if (scalar_x == -1) { + if (scalar_y == 0) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S20", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S21", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S22", policy, op); + } else if (scalar_y == 2) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S23", policy, op); + } } - -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - if (a == 2 && b == -1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S29", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 1' + // **************************************************************** + else if (scalar_x == 1) { + if (scalar_y == 0) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S24", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S25", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S26", policy, op); + } else if (scalar_y == 2) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S27", policy, op); + } } - if (a == 2 && b == 1) { - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S30", policy, op); - return; + // **************************************************************** + // Possibilities with 'scalar_x == 2' + // **************************************************************** + else if (scalar_x == 2) { + if (scalar_y == 0) { + Axpby_MV_Functor op(x, y, av, bv); + 
Kokkos::parallel_for("KokkosBlas::Axpby::MV::S28", policy, op); + } else if (scalar_y == -1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S29", policy, op); + } else if (scalar_y == 1) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S30", policy, op); + } else if (scalar_y == 2) { + Axpby_MV_Functor op(x, y, av, bv); + Kokkos::parallel_for("KokkosBlas::Axpby::MV::S31", policy, op); + } } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - - // a and b arbitrary (not -1, 0, or 1) - Axpby_MV_Functor op(x, y, av, bv); - Kokkos::parallel_for("KokkosBlas::Axpby::MV::S31", policy, op); } // Compute any of the following, in a way optimized for X and Y @@ -1231,11 +1488,11 @@ void Axpby_MV_Generic(const execution_space& space, const AV& av, const XMV& x, // 3. Y(i,j) = a*X(i,j) + b*Y(i,j) for a in -1,0,1 // 4. Y(i,j) = av(j)*X(i,j) + bv(j)*Y(i,j) // -// a and b come in as integers. The values -1, 0, and 1 correspond to -// the literal values of the coefficients. The value 2 tells the -// functor to use the corresponding vector of coefficients: a == 2 -// means use av, and b == 2 means use bv. Otherwise, av resp. vb are -// ignored. +// scalar_x and scalar_y come in as integers. The values -1, 0, and 1 +// correspond to the literal values of the coefficients. The value 2 +// tells the functor to use the corresponding vector of coefficients: +// - scalar_x == 2 means use av, otherwise ignore av; +// - scalar_y == 2 means use bv, otherwise ignore bv. // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. 
This does NOT apply to @@ -1246,24 +1503,33 @@ template struct Axpby_MV_Invoke_Left { static void run(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int a = 2, int b = 2) { + const BV& bv, const YMV& y, int scalar_x = 2, + int scalar_y = 2) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Left: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Left: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Invoke_Left: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Left: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Invoke_Left: " - "X and Y must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": X and Y must have rank 2."); + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_MV_Invoke_Left::run()" + ": scalar_x and/or scalar_y are out of range."); + } const SizeType numCols = x.extent(1); @@ -1279,7 +1545,7 @@ struct Axpby_MV_Invoke_Left { // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. 
Axpby_MV_Unrolled( - space, av, X_cur, bv, Y_cur, j, a, b); + space, av, X_cur, bv, Y_cur, j, scalar_x, scalar_y); } for (; j + 4 <= numCols; j += 4) { XMV X_cur = Kokkos::subview(x, Kokkos::ALL(), std::make_pair(j, j + 4)); @@ -1289,7 +1555,7 @@ struct Axpby_MV_Invoke_Left { // subviews of av and bv, if they are Views. If they are scalars, // the functor doesn't have to do anything to them. Axpby_MV_Unrolled( - space, av, X_cur, bv, Y_cur, j, a, b); + space, av, X_cur, bv, Y_cur, j, scalar_x, scalar_y); } for (; j < numCols; ++j) { auto x_cur = Kokkos::subview(x, Kokkos::ALL(), j); @@ -1301,24 +1567,24 @@ struct Axpby_MV_Invoke_Left { typedef decltype(x_cur) XV; typedef decltype(y_cur) YV; Axpby_Generic( - space, av, x_cur, bv, y_cur, j, a, b); + space, av, x_cur, bv, y_cur, j, scalar_x, scalar_y); } } }; -// Compute any of the following, in a way optimized for X, Y, and R +// Compute any of the following, in a way optimized for X and Y // being LayoutRight: // // 1. Y(i,j) = a*X(i,j) + b*Y(i,j) for a,b in -1,0,1 // 2. Y(i,j) = av(j)*X(i,j) + b*Y(i,j) for b in -1,0,1 -// 3. Y(i,j) = a*X(i,j) + b*Y(i,j) for a in -1,0,1 +// 3. Y(i,j) = a*X(i,j) + bv(j)*Y(i,j) for a in -1,0,1 // 4. Y(i,j) = av(j)*X(i,j) + bv(j)*Y(i,j) // -// a and b come in as integers. The values -1, 0, and 1 correspond to -// the literal values of the coefficients. The value 2 tells the -// functor to use the corresponding vector of coefficients: a == 2 -// means use av, and b == 2 means use bv. Otherwise, av resp. vb are -// ignored. +// scalar_x and scalar_y come in as integers. The values -1, 0, and 1 +// correspond to the literal values of the coefficients. The value 2 +// tells the functor to use the corresponding vector of coefficients: +// - scalar_x == 2 means use av, otherwise ignore av; +// - scalar_y == 2 means use bv, otherwise ignore bv. // // Any literal coefficient of zero has BLAS semantics of ignoring the // corresponding (multi)vector entry. 
This does NOT apply to @@ -1329,24 +1595,33 @@ template struct Axpby_MV_Invoke_Right { static void run(const execution_space& space, const AV& av, const XMV& x, - const BV& bv, const YMV& y, int a = 2, int b = 2) { + const BV& bv, const YMV& y, int scalar_x = 2, + int scalar_y = 2) { static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Right: X is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": X is not a Kokkos::View."); static_assert(Kokkos::is_view::value, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Right: Y is not a Kokkos::View."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": Y is not a Kokkos::View."); static_assert(std::is_same::value, - "KokkosBlas::Impl::Axpby_MV_Invoke_Right: Y is const. " - "It must be nonconst, because it is an output argument " - "(we have to be able to write to its entries)."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": Y must be nonconst, since it is an output argument" + " and we have to be able to write to its entries."); static_assert((int)YMV::rank == (int)XMV::rank, - "KokkosBlas::Impl::" - "Axpby_MV_Invoke_Right: X and Y must have the same rank."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": X and Y must have the same rank."); static_assert(YMV::rank == 2, - "KokkosBlas::Impl::Axpby_MV_Invoke_Right: " - "X and Y must have rank 2."); + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": X and Y must have rank 2."); + if ((-1 <= scalar_x) && (scalar_x <= 2) && (-1 <= scalar_y) && + (scalar_y <= 2)) { + // Ok + } else { + KokkosKernels::Impl::throw_runtime_exception( + "KokkosBlas::Impl::Axpby_MV_Invoke_Right::run()" + ": scalar_x and/or scalar_y are out of range."); + } const SizeType numCols = x.extent(1); if (numCols == 1) { @@ -1355,10 +1630,10 @@ struct Axpby_MV_Invoke_Right { typedef decltype(x_0) XV; typedef decltype(y_0) YV; Axpby_Generic( - space, av, x_0, bv, y_0, 0, a, b); + space, av, x_0, bv, y_0, 0, scalar_x, scalar_y); } 
else { Axpby_MV_Generic( - space, av, x, bv, y, a, b); + space, av, x, bv, y, scalar_x, scalar_y); } } }; diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_spec.hpp index da2924c9f3f4..3aff21e0bedc 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_spec.hpp @@ -56,6 +56,23 @@ struct axpby_eti_spec_avail { Kokkos::MemoryTraits >, \ 1> { \ enum : bool { value = true }; \ + }; \ + template <> \ + struct axpby_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value = true }; \ }; // @@ -82,13 +99,13 @@ struct axpby_eti_spec_avail { template <> \ struct axpby_eti_spec_avail< \ EXEC_SPACE, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ @@ -150,11 +167,17 @@ struct Axpby { }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -// Full specialization for XMV and YMV rank-2 Views. 
+// ********************************************************************** +// Full specialization for XMV and YMV rank-2 Views: +// --> AV = anything and BV = anything +// +// If axpby() runs at a device with rank-2 XMV and rank-2 YMV, then +// the unification process forces AV = view and BV = view +// ********************************************************************** template struct Axpby { - typedef typename YMV::size_type size_type; + using size_type = typename YMV::size_type; static void axpby(const execution_space& space, const AV& av, const XMV& X, const BV& bv, const YMV& Y) { @@ -193,49 +216,83 @@ struct Axpby) { + if constexpr (AV::rank == 1) { + if (av.extent(0) == 0) { + scalar_x = 0; + } + } + } else { + using ATA = Kokkos::ArithTraits; + if (av == ATA::zero()) { + scalar_x = 0; + } else if (av == -ATA::one()) { + scalar_x = -1; + } else if (av == ATA::one()) { + scalar_x = 1; + } } - if (bv.extent(0) == 0) { - b = 0; + + int scalar_y(2); + if constexpr (Kokkos::is_view_v) { + if constexpr (BV::rank == 1) { + if (bv.extent(0) == 0) { + scalar_y = 0; + } + } + } else { + using ATB = Kokkos::ArithTraits; + if (bv == ATB::zero()) { + scalar_y = 0; + } else if (bv == -ATB::one()) { + scalar_y = -1; + } else if (bv == ATB::one()) { + scalar_y = 1; + } } if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - typedef int index_type; - typedef typename std::conditional< + using index_type = int; + using Axpby_MV_Invoke_Layout = typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; + Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, scalar_x, scalar_y); } else { - typedef typename XMV::size_type index_type; - typedef typename std::conditional< + using index_type = typename XMV::size_type; + using Axpby_MV_Invoke_Layout = typename 
std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, a, b); + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; + Axpby_MV_Invoke_Layout::run(space, av, X, bv, Y, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); } }; -// Partial specialization for XMV, and YMV rank-2 Views, -// and AV and BV scalars. +// ********************************************************************** +// Partial specialization for XMV and YMV rank-2 Views: +// --> AV = scalar and BV = scalar +// +// If axpby() runs at the host with rank-2 XMV and rank-2 YMV, then +// the unification process _might_ force AV = scalar and BV = scalar +// ********************************************************************** template struct Axpby { - typedef typename XMV::non_const_value_type AV; - typedef typename YMV::non_const_value_type BV; - typedef typename YMV::size_type size_type; - typedef Kokkos::ArithTraits ATA; - typedef Kokkos::ArithTraits ATB; + using AV = typename XMV::non_const_value_type; + using BV = typename YMV::non_const_value_type; + using size_type = typename YMV::size_type; + using ATA = Kokkos::ArithTraits; + using ATB = Kokkos::ArithTraits; static void axpby(const execution_space& space, const AV& alpha, const XMV& X, const BV& beta, const YMV& Y) { @@ -275,69 +332,135 @@ struct Axpby 2 - else if (alpha == -ATA::one()) { - a = -1; + scalar_x = 0; + } else if (alpha == -ATA::one()) { + scalar_x = -1; } else if (alpha == ATA::one()) { - a = 1; - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else { - a = 2; + scalar_x = 1; } + + int scalar_y(2); if (beta == ATB::zero()) { - b = 0; - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (beta == -ATB::one()) { - b = -1; + scalar_y = 0; + } else if (beta == -ATB::one()) { + scalar_y = -1; } else if (beta == ATB::one()) { - b = 1; - } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else { - b = 2; + 
scalar_y = 1; } if (numRows < static_cast(INT_MAX) && numRows * numCols < static_cast(INT_MAX)) { - typedef int index_type; - typedef typename std::conditional< + using index_type = int; + using Axpby_MV_Invoke_Layout = typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; + Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, scalar_x, scalar_y); } else { - typedef typename XMV::size_type index_type; - typedef typename std::conditional< + using index_type = typename XMV::size_type; + using Axpby_MV_Invoke_Layout = typename std::conditional< std::is_same::value, - Axpby_MV_Invoke_Right, - Axpby_MV_Invoke_Left >::type Axpby_MV_Invoke_Layout; - Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, a, b); + Axpby_MV_Invoke_Left, + Axpby_MV_Invoke_Right >::type; + Axpby_MV_Invoke_Layout::run(space, alpha, X, beta, Y, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); } }; -// Partial specialization for XV and YV rank-1 Views, -// and AV and BV scalars. +// ********************************************************************** +// Full specialization for XV and YV rank-1 Views: +// --> AV = anything and BV = anything +// +// If axpby() runs at a device with rank-1 XV and rank-1 YV, then +// the unification process forces AV = view and BV = view +// ********************************************************************** +template +struct Axpby { + using size_type = typename YV::size_type; + + static void axpby(const execution_space& space, const AV& av, const XV& X, + const BV& bv, const YV& Y) { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ? 
"KokkosBlas::axpby[ETI]" + : "KokkosBlas::axpby[noETI]"); + + size_type const numRows = X.extent(0); + + int scalar_x(2); + if constexpr (Kokkos::is_view_v) { + if constexpr (AV::rank == 1) { + if (av.extent(0) == 0) { + scalar_x = 0; + } + } + } else { + using ATA = Kokkos::ArithTraits; + if (av == ATA::zero()) { + scalar_x = 0; + } else if (av == -ATA::one()) { + scalar_x = -1; + } else if (av == ATA::one()) { + scalar_x = 1; + } + } + + int scalar_y(2); + if constexpr (Kokkos::is_view_v) { + if constexpr (BV::rank == 1) { + if (bv.extent(0) == 0) { + scalar_y = 0; + } + } + } else { + using ATB = Kokkos::ArithTraits; + if (bv == ATB::zero()) { + scalar_y = 0; + } else if (bv == -ATB::one()) { + scalar_y = -1; + } else if (bv == ATB::one()) { + scalar_y = 1; + } + } + + if (numRows < static_cast(INT_MAX)) { + using index_type = int; + Axpby_Generic( + space, av, X, bv, Y, 0, scalar_x, scalar_y); + } else { + using index_type = typename XV::size_type; + Axpby_Generic( + space, av, X, bv, Y, 0, scalar_x, scalar_y); + } + + Kokkos::Profiling::popRegion(); + } +}; + +// ********************************************************************** +// Partial specialization for XV and YV rank-1 Views: +// --> AV = scalar and BV = scalar +// +// If axpby() runs at the host with rank-1 XV and rank-1 YV, then +// the unification process forces AV = scalar and BV = scalar +// ********************************************************************** template struct Axpby { - typedef typename XV::non_const_value_type AV; - typedef typename YV::non_const_value_type BV; - typedef typename YV::size_type size_type; - typedef Kokkos::ArithTraits ATA; - typedef Kokkos::ArithTraits ATB; + using AV = typename XV::non_const_value_type; + using BV = typename YV::non_const_value_type; + using size_type = typename YV::size_type; + using ATA = Kokkos::ArithTraits; + using ATB = Kokkos::ArithTraits; static void axpby(const execution_space& space, const AV& alpha, const XV& X, const BV& beta, const 
YV& Y) { @@ -377,41 +500,36 @@ struct Axpby 2 - else if (alpha == -ATA::one()) { - a = -1; + scalar_x = 0; + } else if (alpha == -ATA::one()) { + scalar_x = -1; } else if (alpha == ATA::one()) { - a = 1; + scalar_x = 1; } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - int b = 2; + int scalar_y(2); if (beta == ATB::zero()) { - b = 0; - } -#if KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 - else if (beta == -ATB::one()) { - b = -1; + scalar_y = 0; + } else if (beta == -ATB::one()) { + scalar_y = -1; } else if (beta == ATB::one()) { - b = 1; + scalar_y = 1; } -#endif // KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 if (numRows < static_cast(INT_MAX)) { - typedef int index_type; + using index_type = int; Axpby_Generic( - space, alpha, X, beta, Y, 0, a, b); + space, alpha, X, beta, Y, 0, scalar_x, scalar_y); } else { - typedef typename XV::size_type index_type; + using index_type = typename XV::size_type; Axpby_Generic( - space, alpha, X, beta, Y, 0, a, b); + space, alpha, X, beta, Y, 0, scalar_x, scalar_y); } Kokkos::Profiling::popRegion(); } @@ -437,6 +555,20 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + extern template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, false, true>; @@ -448,6 +580,20 @@ struct Axpby, \ Kokkos::MemoryTraits >, \ SCALAR, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, false, true>; \ + template struct Axpby< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, false, true>; diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp 
new file mode 100644 index 000000000000..9d200e892d91 --- /dev/null +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas1_axpby_unification_attempt_traits.hpp @@ -0,0 +1,965 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ +#define KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ + +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { + +// -------------------------------- + +template +constexpr int typeRank() { + if constexpr (Kokkos::is_view_v) { + return T::rank; + } + return -1; +} + +// -------------------------------- + +template +constexpr typename std::enable_if, bool>::type Tr0_val() { + return (T::rank == 0); +} + +template +constexpr typename std::enable_if, bool>::type Tr0_val() { + return false; +} + +// -------------------------------- + +template +constexpr typename std::enable_if, bool>::type Tr1s_val() { + return (T::rank == 1) && (T::rank_dynamic == 0); +} + +template +constexpr typename std::enable_if, bool>::type +Tr1s_val() { + return false; +} + +// -------------------------------- + +template +constexpr typename std::enable_if, bool>::type Tr1d_val() { + return (T::rank == 1) && (T::rank_dynamic == 1); +} + +template +constexpr typename std::enable_if, bool>::type +Tr1d_val() { + return false; +} + +// -------------------------------- + +template +struct getScalarTypeFromView { + using type = void; +}; + +template +struct getScalarTypeFromView { + using type = typename 
T::value_type; +}; + +// -------------------------------- + +template +struct getLayoutFromView { + using type = void; +}; + +template +struct getLayoutFromView { + using type = typename T::array_layout; +}; + +// -------------------------------- + +template +struct AxpbyUnificationAttemptTraits { + // ******************************************************************** + // Terminology: + // - variable names begin with lower case letters + // - type names begin with upper case letters + // ******************************************************************** + public: + static constexpr bool onDevice = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + + private: + static constexpr bool onHost = !onDevice; + + public: + static constexpr bool a_is_scalar = !Kokkos::is_view_v; + + private: + static constexpr bool a_is_r0 = Tr0_val(); + static constexpr bool a_is_r1s = Tr1s_val(); + static constexpr bool a_is_r1d = Tr1d_val(); + + static constexpr bool x_is_r1 = Kokkos::is_view_v && (XMV::rank == 1); + static constexpr bool x_is_r2 = Kokkos::is_view_v && (XMV::rank == 2); + + public: + static constexpr bool b_is_scalar = !Kokkos::is_view_v; + + private: + static constexpr bool b_is_r0 = Tr0_val(); + static constexpr bool b_is_r1s = Tr1s_val(); + static constexpr bool b_is_r1d = Tr1d_val(); + + static constexpr bool y_is_r1 = Kokkos::is_view_v && (YMV::rank == 1); + static constexpr bool y_is_r2 = Kokkos::is_view_v && (YMV::rank == 2); + + static constexpr bool xyRank1Case = x_is_r1 && y_is_r1; + static constexpr bool xyRank2Case = x_is_r2 && y_is_r2; + + // ******************************************************************** + // Declare 'AtInputScalarTypeA_nonConst' + // ******************************************************************** + using ScalarTypeA2_onDevice = + typename getScalarTypeFromView::type; + using ScalarTypeA1_onDevice = + std::conditional_t; + + using ScalarTypeA2_onHost = + typename getScalarTypeFromView::type; + using ScalarTypeA1_onHost = + 
std::conditional_t; + + using AtInputScalarTypeA = + std::conditional_t; + + using AtInputScalarTypeA_nonConst = + typename std::remove_const::type; + + // ******************************************************************** + // Declare 'AtInputScalarTypeX_nonConst' + // ******************************************************************** + using AtInputScalarTypeX = typename XMV::value_type; + + using AtInputScalarTypeX_nonConst = typename XMV::non_const_value_type; + + // ******************************************************************** + // Declare 'AtInputScalarTypeB_nonConst' + // ******************************************************************** + using ScalarTypeB2_onDevice = + typename getScalarTypeFromView::type; + using ScalarTypeB1_onDevice = + std::conditional_t; + + using ScalarTypeB2_onHost = + typename getScalarTypeFromView::type; + using ScalarTypeB1_onHost = + std::conditional_t; + + using AtInputScalarTypeB = + std::conditional_t; + + using AtInputScalarTypeB_nonConst = + typename std::remove_const::type; + + // ******************************************************************** + // Declare 'AtInputScalarTypeY_nonConst' + // ******************************************************************** + using AtInputScalarTypeY = typename YMV::value_type; + + using AtInputScalarTypeY_nonConst = typename YMV::non_const_value_type; + + // ******************************************************************** + // Declare 'InternalLayoutX' and 'InternalLayoutY' + // ******************************************************************** + using InternalLayoutX = + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using InternalLayoutY = + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< + YMV, InternalLayoutX>::array_layout; + + // ******************************************************************** + // Declare 'InternalTypeA_tmp' + // ******************************************************************** + using AtInputLayoutA = + 
typename getLayoutFromView::type; + + public: + static constexpr bool atInputLayoutA_isStride = + std::is_same_v; + + private: + using InternalLayoutA = + std::conditional_t<(a_is_r1d || a_is_r1s) && atInputLayoutA_isStride, + AtInputLayoutA, InternalLayoutX>; + + static constexpr bool atInputScalarTypeA_mustRemain = + Kokkos::ArithTraits::is_complex && + !Kokkos::ArithTraits::is_complex; + + using InternalScalarTypeA = std::conditional_t< + atInputScalarTypeA_mustRemain || ((a_is_r1d || a_is_r1s) && xyRank2Case), + AtInputScalarTypeA_nonConst // Yes, keep the input scalar type + , + AtInputScalarTypeX_nonConst // Yes, instead of + // 'AtInputScalarTypeA_nonConst' + >; + + using InternalTypeA_onDevice = std::conditional_t< + a_is_scalar && b_is_scalar && onDevice, // Keep 'a' as scalar + InternalScalarTypeA, + Kokkos::View>>; + + using InternalTypeA_onHost = std::conditional_t< + (a_is_r1d || a_is_r1s) && xyRank2Case && onHost, + Kokkos::View>, + InternalScalarTypeA>; + + using InternalTypeA_tmp = + std::conditional_t; + + // ******************************************************************** + // Declare 'InternalTypeX' + // ******************************************************************** + public: + using InternalTypeX = std::conditional_t< + x_is_r2, + Kokkos::View>, + Kokkos::View>>; + + // ******************************************************************** + // Declare 'InternalTypeB_tmp' + // ******************************************************************** + private: + using AtInputLayoutB = + typename getLayoutFromView::type; + + public: + static constexpr bool atInputLayoutB_isStride = + std::is_same_v; + + private: + using InternalLayoutB = + std::conditional_t<(b_is_r1d || b_is_r1s) && atInputLayoutB_isStride, + AtInputLayoutB, InternalLayoutY>; + + static constexpr bool atInputScalarTypeB_mustRemain = + Kokkos::ArithTraits::is_complex && + !Kokkos::ArithTraits::is_complex; + + using InternalScalarTypeB = std::conditional_t< + 
atInputScalarTypeB_mustRemain || ((b_is_r1d || b_is_r1s) && xyRank2Case), + AtInputScalarTypeB_nonConst // Yes, keep the input scalar type + , + AtInputScalarTypeY_nonConst // Yes, instead of + // 'AtInputScalarTypeB_nonConst' + >; + + using InternalTypeB_onDevice = std::conditional_t< + a_is_scalar && b_is_scalar && onDevice, // Keep 'b' as scalar + InternalScalarTypeB, + Kokkos::View>>; + + using InternalTypeB_onHost = std::conditional_t< + (b_is_r1d || b_is_r1s) && xyRank2Case && onHost, + Kokkos::View>, + InternalScalarTypeB>; + + using InternalTypeB_tmp = + std::conditional_t; + + // ******************************************************************** + // Declare 'InternalTypeY' + // ******************************************************************** + public: + using InternalTypeY = std::conditional_t< + y_is_r2, + Kokkos::View>, + Kokkos::View>>; + + // ******************************************************************** + // Declare 'InternalTypeA': if 'InternalTypeB_tmp' is a view then + // make sure 'InternalTypeA' is a view as well + // ******************************************************************** + using InternalTypeA = std::conditional_t< + !Kokkos::is_view_v && + Kokkos::is_view_v, + Kokkos::View>, + InternalTypeA_tmp>; + + // ******************************************************************** + // Declare 'InternalTypeA_managed' with the same scalar type in + // 'InternalTypeA' + // ******************************************************************** + private: + using InternalLayoutA_managed = InternalLayoutA; + + public: + using InternalTypeA_managed = std::conditional_t< + Kokkos::is_view_v, + Kokkos::View, + void>; + + // ******************************************************************** + // Declare 'InternalTypeB' if 'InternalTypeA_tmp' is a view then + // make sure 'InternalTypeB' is a view as well + // ******************************************************************** + using InternalTypeB = std::conditional_t< + 
Kokkos::is_view_v && + !Kokkos::is_view_v, + Kokkos::View>, + InternalTypeB_tmp>; + + // ******************************************************************** + // Declare 'InternalTypeB_managed' with the same scalar type in + // 'InternalTypeB' + // ******************************************************************** + private: + using InternalLayoutB_managed = InternalLayoutB; + + public: + using InternalTypeB_managed = std::conditional_t< + Kokkos::is_view_v, + Kokkos::View, + void>; + + // ******************************************************************** + // Auxiliary Boolean results on internal types + // ******************************************************************** + private: + static constexpr bool internalTypeA_is_scalar = + !Kokkos::is_view_v; + static constexpr bool internalTypeA_is_r1d = Tr1d_val(); + + static constexpr bool internalTypeB_is_scalar = + !Kokkos::is_view_v; + static constexpr bool internalTypeB_is_r1d = Tr1d_val(); + + public: + static constexpr bool internalTypesAB_bothScalars = + (internalTypeA_is_scalar && internalTypeB_is_scalar); + static constexpr bool internalTypesAB_bothViews = + (internalTypeA_is_r1d && internalTypeB_is_r1d); + + // ******************************************************************** + // Routine to perform checks (both compile time and run time) + // ******************************************************************** + static void performChecks(const AV& a, const XMV& X, const BV& b, + const YMV& Y) { + // ****************************************************************** + // Check 1/6: General checks + // ****************************************************************** + static_assert( + Kokkos::is_execution_space_v, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": tExecSpace must be a valid Kokkos execution space."); + + static_assert( + (xyRank1Case && !xyRank2Case) || (!xyRank1Case && xyRank2Case), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": 
one must have either both X and Y as rank 1, or both X and Y as " + "rank 2"); + + if constexpr (!Kokkos::ArithTraits< + AtInputScalarTypeY_nonConst>::is_complex) { + static_assert( + (!Kokkos::ArithTraits::is_complex) && + (!Kokkos::ArithTraits::is_complex) && + (!Kokkos::ArithTraits::is_complex), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": if Y is not complex, then A, X and B cannot be complex"); + } + + // ****************************************************************** + // Check 2/6: YMV is valid + // ****************************************************************** + static_assert( + Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": Y is not a Kokkos::View."); + static_assert( + std::is_same::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": Y is const. It must be nonconst, " + "because it is an output argument " + "(we must be able to write to its entries)."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": XMV must be accessible from tExecSpace"); + + // ****************************************************************** + // Check 3/6: XMV is valid + // ****************************************************************** + static_assert( + Kokkos::is_view::value, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": X is not a Kokkos::View."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": XMV must be accessible from tExecSpace"); + + if constexpr (xyRank1Case) { + if (X.extent(0) != Y.extent(0)) { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" + ")" + << ", invalid rank-1 X extent" + << ": X.extent(0) = " << X.extent(0) + << ", Y.extent(0) = " << Y.extent(0); + 
KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } else { + if ((X.extent(0) != Y.extent(0)) || (X.extent(1) != Y.extent(1))) { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks(" + ")" + << ", invalid rank-2 X extents" + << ": X.extent(0) = " << X.extent(0) + << ", X.extent(1) = " << X.extent(1) + << ", Y.extent(0) = " << Y.extent(0) + << ", Y.extent(1) = " << Y.extent(1); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } + + // ****************************************************************** + // Check 4/6: AV is valid + // ****************************************************************** + static_assert( + (a_is_scalar && !a_is_r0 && !a_is_r1s && !a_is_r1d) || + (!a_is_scalar && a_is_r0 && !a_is_r1s && !a_is_r1d) || + (!a_is_scalar && !a_is_r0 && a_is_r1s && !a_is_r1d) || + (!a_is_scalar && !a_is_r0 && !a_is_r1s && a_is_r1d), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": 'a' must be either scalar or rank 0 or rank 1 static or rank 1 " + "dynamic"); + + if constexpr (a_is_r1d || a_is_r1s) { + if constexpr (xyRank1Case) { + if (a.extent(0) != 1) { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" + "performChecks()" + << ": view 'a' must have extent(0) == 1 for xyRank1Case" + << ", a.extent(0) = " << a.extent(0); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } else { + if ((a.extent(0) == 1) || + (a.extent(0) == Y.extent(1))) { // Yes, 'Y' is the reference + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" + "performChecks()" + << ": view 'a' must have extent(0) == 1 or Y.extent(1) for " + "xyRank2Case" + << ", a.extent(0) = " << a.extent(0) + << ", Y.extent(0) = " << Y.extent(0) + << ", Y.extent(1) = " << Y.extent(1); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } // if (rank1Case) else + } // if a_is_r1d + + // 
****************************************************************** + // Check 5/6: BV is valid + // ****************************************************************** + static_assert( + (b_is_scalar && !b_is_r0 && !b_is_r1s && !b_is_r1d) || + (!b_is_scalar && b_is_r0 && !b_is_r1s && !b_is_r1d) || + (!b_is_scalar && !b_is_r0 && b_is_r1s && !b_is_r1d) || + (!b_is_scalar && !b_is_r0 && !b_is_r1s && b_is_r1d), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ": 'b' must be either scalar or rank 0 or rank 1 static or rank 1 " + "dynamic"); + + if constexpr (b_is_r1d || b_is_r1s) { + if constexpr (xyRank1Case) { + if (b.extent(0) != 1) { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" + "performChecks()" + << ": view 'b' must have extent(0) == 1 for xyRank1Case" + << ", b.extent(0) = " << b.extent(0); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } else { + if ((b.extent(0) == 1) || (b.extent(0) == Y.extent(1))) { + // Ok + } else { + std::ostringstream msg; + msg << "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::" + "performChecks()" + << ": view 'b' must have extent(0) == 1 or Y.extent(1) for " + "xyRank2Case" + << ", b.extent(0) = " << b.extent(0) + << ", Y.extent(0) = " << Y.extent(0) + << ", Y.extent(1) = " << Y.extent(1); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } // if (rank1Case) else + } // if b_is_r1d + + // ****************************************************************** + // Check 6/6: Checks on InternalTypeA, X, B, Y + // ****************************************************************** + if constexpr (onHost) { + if constexpr (xyRank1Case) { + constexpr bool internalTypeA_isOk = + (internalTypeA_is_scalar || internalTypeA_is_r1d); + static_assert( + internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = std::is_same_v< + 
InternalTypeX, + Kokkos::View>>; + static_assert( + internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = + (internalTypeB_is_scalar || internalTypeB_is_r1d); + static_assert( + internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = std::is_same_v< + InternalTypeY, + Kokkos::View>>; + static_assert( + internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank1Case: InternalTypeY is wrong"); + } else { + constexpr bool internalTypeA_isOk = + (internalTypeA_is_scalar || internalTypeA_is_r1d); + static_assert( + internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = std::is_same_v< + InternalTypeX, + Kokkos::View>>; + static_assert( + internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = + (internalTypeB_is_scalar || internalTypeB_is_r1d); + static_assert( + internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = std::is_same_v< + InternalTypeY, + Kokkos::View>>; + static_assert( + internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, xyRank2Case: InternalTypeY is wrong"); + } + } else { + if constexpr (xyRank1Case) { + constexpr bool internalTypeA_isOk = + internalTypeA_is_r1d || + (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); + static_assert( + internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", 
onDevice, xyRank1Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = std::is_same_v< + InternalTypeX, + Kokkos::View>>; + static_assert( + internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = + internalTypeB_is_r1d || + (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); + static_assert( + internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = std::is_same_v< + InternalTypeY, + Kokkos::View>>; + static_assert( + internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank1Case: InternalTypeY is wrong"); + } else { + constexpr bool internalTypeA_isOk = + internalTypeA_is_r1d || + (a_is_scalar && b_is_scalar && internalTypeA_is_scalar); + static_assert( + internalTypeA_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeA is wrong"); + + constexpr bool internalTypeX_isOk = std::is_same_v< + InternalTypeX, + Kokkos::View>>; + static_assert( + internalTypeX_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeX is wrong"); + + constexpr bool internalTypeB_isOk = + internalTypeB_is_r1d || + (a_is_scalar && b_is_scalar && internalTypeB_is_scalar); + static_assert( + internalTypeB_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeB is wrong"); + + constexpr bool internalTypeY_isOk = std::is_same_v< + InternalTypeY, + Kokkos::View>>; + static_assert( + internalTypeY_isOk, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, xyRank2Case: InternalTypeY is wrong"); + } + } + + if constexpr (onHost) { + // 
**************************************************************** + // We are in the 'onHost' case, with 2 possible subcases:: + // + // 1) xyRank1Case, with the following possible situations: + // - [InternalTypeA, B] = [S_a, S_b], or + // - [InternalTypeA, B] = [view, view] + // + // or + // + // 2) xyRank2Case, with the following possible situations: + // - [InternalTypeA, B] = [S_a, S_b], or + // - [InternalTypeA, B] = [view, view] + // **************************************************************** + static_assert( + internalTypesAB_bothScalars || internalTypesAB_bothViews, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onHost, invalid combination of types"); + } // If onHost + else if constexpr (onDevice) { + // **************************************************************** + // We are in the 'onDevice' case, with 2 possible subcases: + // + // 1) xyRank1Case, with the following possible situations: + // - [InternalTypeA, B] = [S_a, S_b], or + // - [InternalTypeA, B] = [view, view] + // + // or + // + // 2) xyRank2Case, with the following possible situations: + // - [InternalTypeA, B] = [S_a, S_b], or + // - [InternalTypeA, B] = [view, view] + // **************************************************************** + static_assert( + internalTypesAB_bothViews || + (a_is_scalar && b_is_scalar && internalTypesAB_bothScalars), + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", onDevice, invalid combination of types"); + } + + if constexpr (xyRank2Case && (a_is_r1d || a_is_r1s) && + atInputLayoutA_isStride) { + static_assert( + std::is_same_v< + typename getLayoutFromView< + InternalTypeA, Kokkos::is_view_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'a' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); + } + + if constexpr (xyRank2Case && (b_is_r1d || b_is_r1s) && + atInputLayoutB_isStride) { + 
static_assert( + std::is_same_v< + typename getLayoutFromView< + InternalTypeB, Kokkos::is_view_v>::type, + Kokkos::LayoutStride>, + "KokkosBlas::Impl::AxpbyUnificationAttemptTraits::performChecks()" + ", xyRank2Case: coeff 'b' is rank-1 and has LayoutStride at input" + ", but no LayoutStride internally"); + } + } // Constructor + + // ******************************************************************** + // Routine to print information on input variables and internal variables + // ******************************************************************** +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static void printInformation(std::ostream& os, std::string const& headerMsg) { + os << headerMsg << ": AV = " + << typeid(AV).name() + //<< ", AV::const_data_type = " << typeid(AV::const_data_type).name() + //<< ", AV::non_const_data_type = " << + // typeid(AV::non_const_data_type).name() + << ", AtInputScalarTypeA = " << typeid(AtInputScalarTypeA).name() + << ", isConst = " + << std::is_const_v << ", isComplex = " + << Kokkos::ArithTraits::is_complex + << ", AtInputScalarTypeA_nonConst = " + << typeid(AtInputScalarTypeA_nonConst).name() + << ", InternalTypeA = " << typeid(InternalTypeA).name() << "\n" + << ", InternalTypeA_managed = " << typeid(InternalTypeA_managed).name() + << "\n" + << "\n" + << "XMV = " << typeid(XMV).name() << "\n" + << "XMV::value_type = " << typeid(typename XMV::value_type).name() + << "\n" + << "XMV::const_data_type = " + << typeid(typename XMV::const_data_type).name() << "\n" + << "XMV::non_const_data_type = " + << typeid(typename XMV::non_const_data_type).name() << "\n" + << "AtInputScalarTypeX = " << typeid(AtInputScalarTypeX).name() << "\n" + << "isConst = " << std::is_const_v << "\n" + << "isComplex = " + << Kokkos::ArithTraits::is_complex << "\n" + << "AtInputScalarTypeX_nonConst = " + << typeid(AtInputScalarTypeX_nonConst).name() << "\n" + << "InternalTypeX = " << typeid(InternalTypeX).name() << "\n" + << "\n" + << "BV = " + << typeid(BV).name() + 
//<< ", BV::const_data_type = " << typeid(BV::const_data_type).name() + //<< ", BV::non_const_data_type = " << + // typeid(BV::non_const_data_type).name() + << ", AtInputScalarTypeB = " << typeid(AtInputScalarTypeB).name() + << ", isConst = " + << std::is_const_v << ", isComplex = " + << Kokkos::ArithTraits::is_complex + << ", AtInputScalarTypeB_nonConst = " + << typeid(AtInputScalarTypeB_nonConst).name() + << ", InternalTypeB = " << typeid(InternalTypeB).name() << "\n" + << ", InternalTypeB_managed = " << typeid(InternalTypeB_managed).name() + << "\n" + << "\n" + << "YMV = " << typeid(YMV).name() << "\n" + << "YMV::value_type = " << typeid(typename YMV::value_type).name() + << "\n" + << "YMV::const_data_type = " + << typeid(typename YMV::const_data_type).name() << "\n" + << "YMV::non_const_data_type = " + << typeid(typename YMV::non_const_data_type).name() << "\n" + << "AtInputScalarTypeY = " << typeid(AtInputScalarTypeY).name() << "\n" + << "isConst = " << std::is_const_v << "\n" + << "isComplex = " + << Kokkos::ArithTraits::is_complex << "\n" + << "AtInputScalarTypeY_nonConst = " + << typeid(AtInputScalarTypeY_nonConst).name() << "\n" + << "InternalTypeY = " << typeid(InternalTypeY).name() << "\n" + << std::endl; + } +#endif + +}; // struct AxpbyUnificationAttemptTraits + +// -------------------------------- + +template +struct getScalarValueFromVariableAtHost { + getScalarValueFromVariableAtHost() { + static_assert((rankT == -1) || (rankT == 0) || (rankT == 1), + "Generic struct should not have been invoked!"); + } +}; + +template +struct getScalarValueFromVariableAtHost { + static T getValue(T const& var) { return var; } +}; + +template +struct getScalarValueFromVariableAtHost { + static typename T::value_type getValue(T const& var) { return var(); } +}; + +template +struct getScalarValueFromVariableAtHost { + static typename T::value_type getValue(T const& var) { return var[0]; } +}; + +// -------------------------------- + +template +size_t 
getAmountOfScalarsInCoefficient(T const& coeff) { + size_t result = 1; + if constexpr (Kokkos::is_view_v) { + if constexpr (T::rank == 1) { + result = coeff.extent(0); + } + } + return result; +} + +// -------------------------------- + +template +size_t getStrideInCoefficient(T const& coeff) { + size_t result = 1; + if constexpr (Kokkos::is_view_v) { + if constexpr ((T::rank == 1) && (std::is_same_v)) { + result = coeff.stride_0(); + } + } + return result; +} + +// -------------------------------- + +template +static void populateRank1Stride1ViewWithScalarOrNonStrideView( + T_in const& coeff_in, T_out& coeff_out) { + // *********************************************************************** + // 'coeff_out' is assumed to be rank-1, of LayoutLeft or LayoutRight + // + // One has to be careful with situations like the following: + // - a coeff_in that deals with 'double', and + // - a coeff_out deals with 'complex' + // *********************************************************************** + using ScalarOutType = + typename std::remove_const::type; + + if constexpr (!Kokkos::is_view_v) { + // ********************************************************************* + // 'coeff_in' is scalar + // ********************************************************************* + ScalarOutType scalarValue(coeff_in); + Kokkos::deep_copy(coeff_out, scalarValue); + } else if constexpr (T_in::rank == 0) { + // ********************************************************************* + // 'coeff_in' is rank-0 + // ********************************************************************* + typename T_in::HostMirror h_coeff_in("h_coeff_in"); + Kokkos::deep_copy(h_coeff_in, coeff_in); + ScalarOutType scalarValue(h_coeff_in()); + Kokkos::deep_copy(coeff_out, scalarValue); + } else { + // ********************************************************************* + // 'coeff_in' is also rank-1 + // ********************************************************************* + if (coeff_out.extent(0) != 
coeff_in.extent(0)) { + std::ostringstream msg; + msg << "In populateRank1Stride1ViewWithScalarOrNonStrideView()" + << ": 'in' and 'out' should have the same extent(0)" + << ", T_in = " << typeid(T_in).name() + << ", coeff_in.label() = " << coeff_in.label() + << ", coeff_in.extent(0) = " << coeff_in.extent(0) + << ", T_out = " << typeid(T_out).name() + << ", coeff_out.label() = " << coeff_out.label() + << ", coeff_out.extent(0) = " << coeff_out.extent(0); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + + using ScalarInType = + typename std::remove_const::type; + if constexpr (std::is_same_v) { + coeff_out = coeff_in; + } else if (coeff_out.extent(0) == 1) { + typename T_in::HostMirror h_coeff_in("h_coeff_in"); + Kokkos::deep_copy(h_coeff_in, coeff_in); + ScalarOutType scalarValue(h_coeff_in[0]); + Kokkos::deep_copy(coeff_out, scalarValue); + } else { + std::ostringstream msg; + msg << "In populateRank1Stride1ViewWithScalarOrNonStrideView()" + << ": scalar types 'in' and 'out' should be the same" + << ", T_in = " << typeid(T_in).name() + << ", ScalarInType = " << typeid(ScalarInType).name() + << ", coeff_in.label() = " << coeff_in.label() + << ", coeff_in.extent(0) = " << coeff_in.extent(0) + << ", T_out = " << typeid(T_out).name() + << ", ScalarOutType = " << typeid(ScalarOutType).name() + << ", coeff_out.label() = " << coeff_out.label() + << ", coeff_out.extent(0) = " << coeff_out.extent(0); + KokkosKernels::Impl::throw_runtime_exception(msg.str()); + } + } +} // populateRank1Stride1ViewWithScalarOrNonStrideView() + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOS_BLAS1_AXPBY_UNIFICATION_ATTEMPT_TRAITS_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_impl.hpp index 730f88602a6b..dc0f531583a9 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -199,10 +199,9 @@ 
struct SingleLevelTransposeGEMV { }; // Single-level parallel version of GEMV. -template -void singleLevelGemv(const typename AViewType::execution_space& space, - const char trans[], +template +void singleLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -222,9 +221,8 @@ void singleLevelGemv(const typename AViewType::execution_space& space, static_assert(std::is_integral::value, "IndexType must be an integer"); - using y_value_type = typename YViewType::non_const_value_type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::RangePolicy; + using y_value_type = typename YViewType::non_const_value_type; + using policy_type = Kokkos::RangePolicy; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; @@ -442,8 +440,8 @@ struct TwoLevelGEMV_LayoutRightTag {}; // --------------------------------------------------------------------------------------------- // Functor for a two-level parallel_reduce version of GEMV (non-transpose), // designed for performance on GPU. Kernel depends on the layout of A. -template +template struct TwoLevelGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; @@ -453,9 +451,8 @@ struct TwoLevelGEMV { std::is_same::value, float, y_value_type>::type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; TwoLevelGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, @@ -564,7 +561,8 @@ struct TwoLevelGEMV { // transpose GEMV. 
The functor uses parallel-for over the columns of the input // matrix A and each team uses parallel-reduce over the row of its column. // The output vector y is the reduction result. -template struct TwoLevelTransposeGEMV { using y_value_type = typename YViewType::non_const_value_type; @@ -575,9 +573,8 @@ struct TwoLevelTransposeGEMV { std::is_same::value, float, y_value_type>::type; - using execution_space = typename AViewType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; TwoLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, @@ -637,10 +634,9 @@ struct TwoLevelTransposeGEMV { }; // Two-level parallel version of GEMV. -template -void twoLevelGemv(const typename AViewType::execution_space& space, - const char trans[], +template +void twoLevelGemv(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -661,9 +657,8 @@ void twoLevelGemv(const typename AViewType::execution_space& space, "IndexType must be an integer"); using y_value_type = typename YViewType::non_const_value_type; - using execution_space = typename AViewType::execution_space; - using team_policy_type = Kokkos::TeamPolicy; - using range_policy_type = Kokkos::RangePolicy; + using team_policy_type = Kokkos::TeamPolicy; + using range_policy_type = Kokkos::RangePolicy; using Kokkos::ArithTraits; using KAT = ArithTraits; @@ -704,19 +699,19 @@ void twoLevelGemv(const typename AViewType::execution_space& space, using layout_tag = typename std::conditional::type; - using tagged_policy = Kokkos::TeamPolicy; - using functor_type = - TwoLevelGEMV; + using tagged_policy = Kokkos::TeamPolicy; + using functor_type = TwoLevelGEMV; functor_type functor(alpha, A, x, 
beta, y); tagged_policy team; - if (isLayoutLeft) { + if constexpr (isLayoutLeft) { using AccumScalar = typename std::conditional< std::is_same::value || std::is_same::value, float, y_value_type>::type; size_t sharedPerTeam = 32 * sizeof(AccumScalar); IndexType numTeams = (A.extent(0) + 31) / 32; - tagged_policy temp(1, 1); + tagged_policy temp(space, 1, 1); temp.set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); int teamSize = temp.team_size_recommended(functor, Kokkos::ParallelForTag()); @@ -727,7 +722,7 @@ void twoLevelGemv(const typename AViewType::execution_space& space, // FIXME SYCL: team_size_recommended() returns too big of a team size. // Kernel hangs with 1024 threads on XEHP. #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same::value) { + if (std::is_same::value) { if (teamSize > 256) teamSize = 256; } #endif @@ -749,16 +744,18 @@ void twoLevelGemv(const typename AViewType::execution_space& space, } else if (tr == 'T') { // transpose, and not conj transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = TwoLevelTransposeGEMV; + using functor_type = + TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, functor); } else if (tr == 'C' || tr == 'H') { // conjugate transpose team_policy_type team(space, A.extent(1), Kokkos::AUTO); - using functor_type = TwoLevelTransposeGEMV; + using functor_type = + TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team, functor); @@ -769,11 +766,11 @@ void twoLevelGemv(const typename AViewType::execution_space& space, // generalGemv: use 1 level (Range) or 2 level (Team) implementation, // depending on whether execution space is CPU or GPU. enable_if makes sure // unused kernels are not instantiated. 
-template ()>::type* = nullptr> -void generalGemvImpl(const typename AViewType::execution_space& space, - const char trans[], + ExecutionSpace>()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -781,11 +778,11 @@ void generalGemvImpl(const typename AViewType::execution_space& space, singleLevelGemv(space, trans, alpha, A, x, beta, y); } -template ()>::type* = nullptr> -void generalGemvImpl(const typename AViewType::execution_space& space, - const char trans[], + ExecutionSpace>()>::type* = nullptr> +void generalGemvImpl(const ExecutionSpace& space, const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_spec.hpp index 08842a61c064..97e6e2717e2e 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -104,10 +104,10 @@ struct GEMV { // Prefer int as the index type, but use a larger type if needed. if (numRows < static_cast(INT_MAX) && numCols < static_cast(INT_MAX)) { - generalGemvImpl(space, trans, alpha, - A, x, beta, y); + generalGemvImpl( + space, trans, alpha, A, x, beta, y); } else { - generalGemvImpl( + generalGemvImpl( space, trans, alpha, A, x, beta, y); } Kokkos::Profiling::popRegion(); diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_impl.hpp new file mode 100644 index 000000000000..69284e9547a0 --- /dev/null +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_impl.hpp @@ -0,0 +1,369 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_IMPL_HPP_ +#define KOKKOSBLAS2_SYR2_IMPL_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosBlas { +namespace Impl { + +// Functor for the thread parallel version of SYR2. +// This functor parallelizes over rows of the input matrix A. +template +struct ThreadParallelSYR2 { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + ThreadParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) + : alpha_(alpha), x_(x), y_(y), A_(A) { + // Nothing to do + } + + KOKKOS_INLINE_FUNCTION void operator()(const IndexType& i) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else if ((x_(i) == Kokkos::ArithTraits::zero()) && + (y_(i) == Kokkos::ArithTraits::zero())) { + // Nothing to do + } else { + const XComponentType x_fixed(x_(i)); + const YComponentType y_fixed(y_(i)); + const IndexType N(A_.extent(1)); + + if constexpr (tJustTranspose) { + if (x_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + } + } + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + 
for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); + } + } + } + } else { + if (x_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); + } + } + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + for (IndexType j = 0; j < N; ++j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + Kokkos::ArithTraits::conj(alpha_) * y_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + } + } + } + } + } + + private: + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Thread parallel version of SYR2. +template +void threadParallelSyr2(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (x.extent(0) == 0) { + // no entries to update + } else if (y.extent(0) == 0) { + // no entries to update + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + } else { + Kokkos::RangePolicy rangePolicy(space, 0, + A.extent(0)); + ThreadParallelSYR2 + functor(alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::syr2[threadParallel]", rangePolicy, + functor); + } +} + +struct TeamParallelSYR2_LayoutLeftTag {}; +struct TeamParallelSYR2_LayoutRightTag {}; + +// --------------------------------------------------------------------------------------------- + +// Functor for the team parallel version of SYR2, designed for +// performance on GPUs. The kernel depends on the layout of A. 
+template +struct TeamParallelSYR2 { + using AlphaCoeffType = typename AViewType::non_const_value_type; + using XComponentType = typename XViewType::non_const_value_type; + using YComponentType = typename YViewType::non_const_value_type; + using AComponentType = typename AViewType::non_const_value_type; + + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + TeamParallelSYR2(const AlphaCoeffType& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) + : alpha_(alpha), x_(x), y_(y), A_(A) { + // Nothing to do + } + + public: + // LayoutLeft version: one team per column + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutLeftTag, + const member_type& team) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType j(team.league_rank()); + if ((x_(j) == Kokkos::ArithTraits::zero()) && + (y_(j) == Kokkos::ArithTraits::zero())) { + // Nothing to do + } else { + const IndexType M(A_.extent(0)); + if constexpr (tJustTranspose) { + const XComponentType x_fixed(x_(j)); + const YComponentType y_fixed(y_(j)); + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); + } + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_(i) * x_fixed); + } + }); + } + } else { + const XComponentType x_fixed( + Kokkos::ArithTraits::conj(x_(j))); + const YComponentType y_fixed( + Kokkos::ArithTraits::conj(y_(j))); + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp 
== true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_(i) * y_fixed); + } + }); + } + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, M), [&](const IndexType& i) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + Kokkos::ArithTraits::conj(alpha_) * + y_(i) * x_fixed); + } + }); + } + } + } + } + } + + // LayoutRight version: one team per row + KOKKOS_INLINE_FUNCTION void operator()(TeamParallelSYR2_LayoutRightTag, + const member_type& team) const { + if (alpha_ == Kokkos::ArithTraits::zero()) { + // Nothing to do + } else { + const IndexType i(team.league_rank()); + if ((x_(i) == Kokkos::ArithTraits::zero()) && + (y_(i) == Kokkos::ArithTraits::zero())) { + // Nothing to do + } else { + const IndexType N(A_.extent(1)); + const XComponentType x_fixed(x_(i)); + const YComponentType y_fixed(y_(i)); + if constexpr (tJustTranspose) { + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * x_fixed * y_(j)); + } + }); + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType(alpha_ * y_fixed * x_(j)); + } + }); + } + } else { + if (x_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + alpha_ * x_fixed * + Kokkos::ArithTraits::conj(y_(j))); + } + }); + } + if (y_fixed != Kokkos::ArithTraits::zero()) { + Kokkos::parallel_for( + 
Kokkos::TeamThreadRange(team, N), [&](const IndexType& j) { + if (((tJustUp == true) && (i <= j)) || + ((tJustUp == false) && (i >= j))) { + A_(i, j) += AComponentType( + Kokkos::ArithTraits::conj(alpha_) * + y_fixed * + Kokkos::ArithTraits::conj(x_(j))); + } + }); + } + } + } + } + } + + private: + AlphaCoeffType alpha_; + typename XViewType::const_type x_; + typename YViewType::const_type y_; + AViewType A_; +}; + +// Team parallel version of SYR2. +template +void teamParallelSyr2(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + static_assert(std::is_integral::value, + "IndexType must be an integer"); + + using AlphaCoeffType = typename AViewType::non_const_value_type; + + if (x.extent(0) == 0) { + // no entries to update + return; + } else if (y.extent(0) == 0) { + // no entries to update + return; + } else if (alpha == Kokkos::ArithTraits::zero()) { + // no entries to update + return; + } + + constexpr bool isLayoutLeft = + std::is_same::value; + using layout_tag = + typename std::conditional::type; + using TeamPolicyType = Kokkos::TeamPolicy; + TeamPolicyType teamPolicy; + if (isLayoutLeft) { + // LayoutLeft: one team per column + teamPolicy = TeamPolicyType(space, A.extent(1), Kokkos::AUTO); + } else { + // LayoutRight: one team per row + teamPolicy = TeamPolicyType(space, A.extent(0), Kokkos::AUTO); + } + + TeamParallelSYR2 + functor(alpha, x, y, A); + Kokkos::parallel_for("KokkosBlas::syr2[teamParallel]", teamPolicy, functor); +} + +// --------------------------------------------------------------------------------------------- + +// generalSyr2Impl(): +// - use thread parallel code (rangePolicy) if execution space is CPU; +// - use team parallel code (teamPolicy) if execution space is GPU. +// +// The 'enable_if' makes sure unused kernels are not instantiated. 
+ +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + threadParallelSyr2(space, alpha, x, y, A); +} + +template ()>::type* = nullptr> +void generalSyr2Impl(const ExecutionSpace& space, + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, + const AViewType& A) { + teamParallelSyr2(space, alpha, x, y, A); +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR2_IMPL_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_spec.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_spec.hpp new file mode 100644 index 000000000000..01637ba1d446 --- /dev/null +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr2_spec.hpp @@ -0,0 +1,180 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_SPEC_HPP_ +#define KOKKOSBLAS2_SYR2_SPEC_HPP_ + +#include "KokkosKernels_config.h" +#include "Kokkos_Core.hpp" + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr2_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization availability +// KokkosBlas::Impl::SYR2. 
This is NOT for users!!! All the declarations of full +// specializations go in this header file. We may spread out definitions (see +// _INST macro below) across one or more .cpp files. +// +#define KOKKOSBLAS2_SYR2_ETI_SPEC_AVAIL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template <> \ + struct syr2_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosBlas { +namespace Impl { + +// +// syr2 +// + +// Implementation of KokkosBlas::syr2. +template < + class ExecutionSpace, class XViewType, class YViewType, class AViewType, + bool tpl_spec_avail = syr2_tpl_spec_avail::value, + bool eti_spec_avail = syr2_eti_spec_avail::value> +struct SYR2 { + static void syr2(const ExecutionSpace& space, const char trans[], + const char uplo[], + const typename AViewType::const_value_type& alpha, + const XViewType& x, const YViewType& y, const AViewType& A) +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + { + Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + ? "KokkosBlas::syr2[ETI]" + : "KokkosBlas::syr2[noETI]"); + + typedef typename AViewType::size_type size_type; + const size_type numRows = A.extent(0); + const size_type numCols = A.extent(1); + + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); + bool justUp = (uplo[0] == 'U') || (uplo[0] == 'u'); + + // Prefer int as the index type, but use a larsyr2 type if needed. 
+ if ((numRows < static_cast(INT_MAX)) && + (numCols < static_cast(INT_MAX))) { + if (justTranspose) { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } else { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } + } else { + if (justTranspose) { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } else { + if (justUp) { + generalSyr2Impl(space, alpha, x, y, A); + } else { + generalSyr2Impl(space, alpha, x, y, A); + } + } + } + + Kokkos::Profiling::popRegion(); + } +#else + ; +#endif // if !defined(KOKKOSKERNELS_ETI_ONLY) || + // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +}; + +} // namespace Impl +} // namespace KokkosBlas + +// +// Macro for declaration of full specialization of KokkosBlas::Impl::SYR2. +// This is NOT for users!!! +// All the declarations of full specializations go in this header file. +// We may spread out definitions (see _DEF macro below) across one or more .cpp +// files. 
+// +#define KOKKOSBLAS2_SYR2_ETI_SPEC_DECL(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + extern template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSBLAS2_SYR2_ETI_SPEC_INST(SCALAR, LAYOUT, EXEC_SPACE, MEM_SPACE) \ + template struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include + +#endif // KOKKOSBLAS2_SYR2_SPEC_HPP_ diff --git a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_impl.hpp b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_impl.hpp index 439ed588dbbf..685ca75997cc 100644 --- a/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_impl.hpp +++ b/packages/kokkos-kernels/blas/impl/KokkosBlas2_syr_impl.hpp @@ -94,7 +94,7 @@ void threadParallelSyr(const ExecutionSpace& space, A.extent(0)); ThreadParallelSYR functor(alpha, x, A); - Kokkos::parallel_for("KokkosBlas::syr[thredParallel]", rangePolicy, + Kokkos::parallel_for("KokkosBlas::syr[threadParallel]", rangePolicy, functor); } } diff --git a/packages/kokkos-kernels/blas/src/KokkosBlas1_axpby.hpp b/packages/kokkos-kernels/blas/src/KokkosBlas1_axpby.hpp index 2f59cb4cce9b..5cd03dd7c7bc 100644 --- a/packages/kokkos-kernels/blas/src/KokkosBlas1_axpby.hpp +++ b/packages/kokkos-kernels/blas/src/KokkosBlas1_axpby.hpp @@ -17,124 +17,262 @@ #ifndef KOKKOSBLAS1_AXPBY_HPP_ #define KOKKOSBLAS1_AXPBY_HPP_ +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) +#include +#endif // KOKKOSKERNELS_DEBUG_LEVEL + #include #include #include #include +#include // axpby() accepts both scalar coefficients a and b, and vector // coefficients (apply one for each column of the input multivectors). // This traits class helps axpby() select the correct specialization -// of AV and BV (the type of a resp. 
b) for invoking the +// of AV (type of 'a') and BV (type of 'b') for invoking the // implementation. namespace KokkosBlas { /// \brief Computes Y := a*X + b*Y /// -/// This function is non-blocking and thread safe. +/// This function is non-blocking and thread-safe. /// -/// \tparam execution_space a Kokkos execution space where the kernel will run. -/// \tparam AV 1-D or 2-D Kokkos::View specialization. -/// \tparam XMV 1-D or 2-D Kokkos::View specialization. -/// \tparam BV 1-D or 2-D Kokkos::View specialization. -/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have -/// the same rank as XMV. +/// \tparam execution_space The type of execution space where the kernel +/// will run. +/// \tparam AV Scalar or 0-D or 1-D Kokkos::View. +/// \tparam XMV 1-D Kokkos::View or 2-D Kokkos::View. It +/// must have the same rank as YMV. +/// \tparam BV Scalar or 0-D or 1-D Kokkos::View. +/// \tparam YMV 1-D or 2-D Kokkos::View. /// -/// \param space [in] the execution space instance on which the kernel will run. -/// \param a [in] view of type AV, scaling parameter for X. -/// \param X [in] input view of type XMV. -/// \param b [in] view of type BV, scaling parameter for Y. -/// \param Y [in/out] view of type YMV in which the results will be stored. +/// \param exec_space [in] The execution space instance on which the kernel +/// will run. +/// \param a [in] Input of type AV: +/// - scaling parameter for 1-D or 2-D X, +/// - scaling parameters for 2-D X. +/// \param X [in] View of type XMV. It must have the same +/// extent(s) as Y. +/// \param b [in] input of type BV: +/// - scaling parameter for 1-D or 2-D Y, +/// - scaling parameters for 2-D Y. +/// \param Y [in/out] View of type YMV in which the results will be +/// stored. 
template -void axpby(const execution_space& space, const AV& a, const XMV& X, const BV& b, - const YMV& Y) { - static_assert(Kokkos::is_execution_space_v, - "KokkosBlas::axpby: execution_space must be a valid Kokkos " - "execution space."); - static_assert(Kokkos::is_view::value, - "KokkosBlas::axpby: " - "X is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::axpby: XMV must be accessible from execution_space"); - static_assert(Kokkos::is_view::value, - "KokkosBlas::axpby: " - "Y is not a Kokkos::View."); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::axpby: XMV must be accessible from execution_space"); - static_assert(std::is_same::value, - "KokkosBlas::axpby: Y is const. It must be nonconst, " - "because it is an output argument " - "(we must be able to write to its entries)."); - static_assert(int(YMV::rank) == int(XMV::rank), - "KokkosBlas::axpby: " - "X and Y must have the same rank."); - static_assert(YMV::rank == 1 || YMV::rank == 2, - "KokkosBlas::axpby: " - "XMV and YMV must either have rank 1 or rank 2."); - - // Check compatibility of dimensions at run time. 
- if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { - std::ostringstream os; - os << "KokkosBlas::axpby: Dimensions of X and Y do not match: " - << "X: " << X.extent(0) << " x " << X.extent(1) << ", Y: " << Y.extent(0) - << " x " << Y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } +void axpby(const execution_space& exec_space, const AV& a, const XMV& X, + const BV& b, const YMV& Y) { + using AxpbyTraits = + Impl::AxpbyUnificationAttemptTraits; + using InternalTypeA = typename AxpbyTraits::InternalTypeA; + using InternalTypeX = typename AxpbyTraits::InternalTypeX; + using InternalTypeB = typename AxpbyTraits::InternalTypeB; + using InternalTypeY = typename AxpbyTraits::InternalTypeY; + + // ********************************************************************** + // Perform compile time checks and run time checks. + // ********************************************************************** + AxpbyTraits::performChecks(a, X, b, Y); +#if (KOKKOSKERNELS_DEBUG_LEVEL > 1) + AxpbyTraits::printInformation(std::cout, "axpby(), unif information"); +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + // ********************************************************************** + // Call Impl::Axpby<...>::axpby(...) + // ********************************************************************** + InternalTypeX internal_X = X; + InternalTypeY internal_Y = Y; + + if constexpr (AxpbyTraits::internalTypesAB_bothScalars) { + // ******************************************************************** + // The unification logic applies the following general rules: + // 1) In a 'onHost' case, it makes the internal types for 'a' and 'b' + // to be both scalars (hence the name 'internalTypesAB_bothScalars') + // 2) In a 'onDevice' case, it makes the internal types for 'a' and 'b' + // to be Kokkos views. 
For performance reasons in Trilinos, the only + // exception for this rule is when the input types for both 'a' and + // 'b' are already scalars, in which case the internal types for 'a' + // and 'b' become scalars as well, eventually changing precision in + // order to match the precisions of 'X' and 'Y'. + // ******************************************************************** + if constexpr (AxpbyTraits::a_is_scalar && AxpbyTraits::b_is_scalar && + AxpbyTraits::onDevice) { + // ****************************************************************** + // We are in the exception situation for rule 2 + // ****************************************************************** + InternalTypeA internal_a(a); + InternalTypeA internal_b(b); - using UnifiedXLayout = - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; - using UnifiedYLayout = - typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< - YMV, UnifiedXLayout>::array_layout; - - // Create unmanaged versions of the input Views. XMV and YMV may be - // rank 1 or rank 2. AV and BV may be either rank-1 Views, or - // scalar values. 
- using XMV_Internal = Kokkos::View >; - using YMV_Internal = Kokkos::View >; - using AV_Internal = - typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; - using BV_Internal = - typename KokkosKernels::Impl::GetUnifiedScalarViewType::type; - - AV_Internal a_internal = a; - XMV_Internal X_internal = X; - BV_Internal b_internal = b; - YMV_Internal Y_internal = Y; - - Impl::Axpby::axpby(space, a_internal, X_internal, b_internal, - Y_internal); + Impl::Axpby::axpby(exec_space, internal_a, internal_X, + internal_b, internal_Y); + } else { + // ****************************************************************** + // We are in rule 1, that is, we are in a 'onHost' case now + // ****************************************************************** + InternalTypeA internal_a(Impl::getScalarValueFromVariableAtHost< + AV, Impl::typeRank()>::getValue(a)); + InternalTypeB internal_b(Impl::getScalarValueFromVariableAtHost< + BV, Impl::typeRank()>::getValue(b)); + + Impl::Axpby::axpby(exec_space, internal_a, internal_X, + internal_b, internal_Y); + } + } else if constexpr (AxpbyTraits::internalTypesAB_bothViews) { + constexpr bool internalLayoutA_isStride( + std::is_same_v); + constexpr bool internalLayoutB_isStride( + std::is_same_v); + + const size_t numScalarsA(Impl::getAmountOfScalarsInCoefficient(a)); + const size_t numScalarsB(Impl::getAmountOfScalarsInCoefficient(b)); + + const size_t strideA(Impl::getStrideInCoefficient(a)); + const size_t strideB(Impl::getStrideInCoefficient(b)); + + Kokkos::LayoutStride layoutStrideA{numScalarsA, strideA}; + Kokkos::LayoutStride layoutStrideB{numScalarsB, strideB}; + + InternalTypeA internal_a; + InternalTypeB internal_b; + + if constexpr (internalLayoutA_isStride) { + // ****************************************************************** + // Prepare internal_a + // ****************************************************************** + typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", + layoutStrideA); + if 
constexpr (AxpbyTraits::atInputLayoutA_isStride) { + Kokkos::deep_copy(managed_a, a); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(a, managed_a); + } + internal_a = managed_a; + + if constexpr (internalLayoutB_isStride) { + // **************************************************************** + // Prepare internal_b + // **************************************************************** + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", + layoutStrideB); + if constexpr (AxpbyTraits::atInputLayoutB_isStride) { + Kokkos::deep_copy(managed_b, b); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(b, managed_b); + } + internal_b = managed_b; + + // **************************************************************** + // Call Impl::Axpby<...>::axpby(...) + // **************************************************************** + Impl::Axpby::axpby(exec_space, internal_a, + internal_X, internal_b, + internal_Y); + } else { + // **************************************************************** + // Prepare internal_b + // **************************************************************** + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", + numScalarsB); + if constexpr (AxpbyTraits::atInputLayoutB_isStride) { + Kokkos::deep_copy(managed_b, b); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(b, managed_b); + } + internal_b = managed_b; + + // **************************************************************** + // Call Impl::Axpby<...>::axpby(...) 
+ // **************************************************************** + Impl::Axpby::axpby(exec_space, internal_a, + internal_X, internal_b, + internal_Y); + } + } else { + // ****************************************************************** + // Prepare internal_a + // ****************************************************************** + typename AxpbyTraits::InternalTypeA_managed managed_a("managed_a", + numScalarsA); + if constexpr (AxpbyTraits::atInputLayoutA_isStride) { + Kokkos::deep_copy(managed_a, a); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(a, managed_a); + } + internal_a = managed_a; + + if constexpr (internalLayoutB_isStride) { + // **************************************************************** + // Prepare internal_b + // **************************************************************** + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", + layoutStrideB); + if constexpr (AxpbyTraits::atInputLayoutB_isStride) { + Kokkos::deep_copy(managed_b, b); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(b, managed_b); + } + internal_b = managed_b; + + // **************************************************************** + // Call Impl::Axpby<...>::axpby(...) + // **************************************************************** + Impl::Axpby::axpby(exec_space, internal_a, + internal_X, internal_b, + internal_Y); + } else { + // **************************************************************** + // Prepare internal_b + // **************************************************************** + typename AxpbyTraits::InternalTypeB_managed managed_b("managed_b", + numScalarsB); + if constexpr (AxpbyTraits::atInputLayoutB_isStride) { + Kokkos::deep_copy(managed_b, b); + } else { + Impl::populateRank1Stride1ViewWithScalarOrNonStrideView(b, managed_b); + } + internal_b = managed_b; + + // **************************************************************** + // Call Impl::Axpby<...>::axpby(...) 
+ // **************************************************************** + Impl::Axpby::axpby(exec_space, internal_a, + internal_X, internal_b, + internal_Y); + } + } + } } /// \brief Computes Y := a*X + b*Y /// -/// This function is non-blocking and thread-safe +/// This function is non-blocking and thread-safe. /// The kernel is executed in the default stream/queue /// associated with the execution space of XMV. /// -/// \tparam AV 1-D or 2-D Kokkos::View specialization. -/// \tparam XMV 1-D or 2-D Kokkos::View specialization. -/// \tparam BV 1-D or 2-D Kokkos::View specialization. -/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have -/// the same rank as XMV. +/// \tparam AV Scalar or 0-D Kokkos::View or 1-D Kokkos::View. +/// \tparam XMV 1-D Kokkos::View or 2-D Kokkos::View. It must +/// have the same rank as YMV. +/// \tparam BV Scalar or 0-D Kokkos::View or 1-D Kokkos::View. +/// \tparam YMV 1-D Kokkos::View or 2-D Kokkos::View. /// -/// \param a [in] view of type AV, scaling parameter for X. -/// \param X [in] input view of type XMV. -/// \param b [in] view of type BV, scaling parameter for Y. -/// \param Y [in/out] view of type YMV in which the results will be stored. +/// \param a [in] Input of type AV: +/// - scaling parameter for 1-D or 2-D X, +/// - scaling parameters for 2-D X. +/// \param X [in] View of type XMV. It must have the same +/// extent(s) as Y. +/// \param b [in] input of type BV: +/// - scaling parameter for 1-D or 2-D Y, +/// - scaling parameters for 2-D Y. +/// \param Y [in/out] View of type YMV in which the results will be +/// stored. template void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { axpby(typename XMV::execution_space{}, a, X, b, Y); @@ -142,39 +280,49 @@ void axpby(const AV& a, const XMV& X, const BV& b, const YMV& Y) { /// \brief Computes Y := a*X + Y /// -/// This function is non-blocking and thread-safe +/// This function is non-blocking and thread-safe. 
/// -/// \tparam execution_space a Kokkos execution space where the kernel will run. -/// \tparam AV 1-D or 2-D Kokkos::View specialization. -/// \tparam XMV 1-D or 2-D Kokkos::View specialization. -/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have -/// the same rank as XMV. +/// \tparam execution_space The type of execution space where the kernel +/// will run. +/// \tparam AV Scalar or 0-D or 1-D Kokkos::View. +/// \tparam XMV 1-D or 2-D Kokkos::View. It must have the +/// the same rank as YMV. +/// \tparam YMV 1-D or 2-D Kokkos::View. /// -/// \param space [in] the execution space instance on which the kernel will run. -/// \param a [in] view of type AV, scaling parameter for X. -/// \param X [in] input view of type XMV. -/// \param Y [in/out] view of type YMV in which the results will be stored. +/// \param exec_space [in] The execution space instance on which the kernel +/// will run. +/// \param a [in] Input of type AV: +/// - scaling parameter for 1-D or 2-D X, +/// - scaling parameters for 2-D X. +/// \param X [in] View of type XMV. It must have the same +/// extent(s) as Y. +/// \param Y [in/out] View of type YMV in which the results will be +/// stored. template -void axpy(const execution_space& space, const AV& a, const XMV& X, +void axpy(const execution_space& exec_space, const AV& a, const XMV& X, const YMV& Y) { - axpby(space, a, X, + axpby(exec_space, a, X, Kokkos::ArithTraits::one(), Y); } /// \brief Computes Y := a*X + Y /// -/// This function is non-blocking and thread-safe +/// This function is non-blocking and thread-safe. /// The kernel is executed in the default stream/queue /// associated with the execution space of XMV. /// -/// \tparam AV 1-D or 2-D Kokkos::View specialization. -/// \tparam XMV 1-D or 2-D Kokkos::View specialization. -/// \tparam YMV 1-D or 2-D Kokkos::View specialization. It must have -/// the same rank as XMV. +/// \tparam AV Scalar or 0-D Kokkos::View or 1-D Kokkos::View. 
+/// \tparam XMV 1-D Kokkos::View or 2-D Kokkos::View. It must +/// have the same rank as YMV. +/// \tparam YMV 1-D Kokkos::View or 2-D Kokkos::View. /// -/// \param a [in] view of type AV, scaling parameter for X. -/// \param X [in] input view of type XMV. -/// \param Y [in/out] view of type YMV in which the results will be stored. +/// \param a [in] Input of type AV: +/// - scaling parameter for 1-D or 2-D X, +/// - scaling parameters for 2-D X. +/// \param X [in] View of type XMV. It must have the same +/// extent(s) as Y. +/// \param Y [in/out] View of type YMV in which the results will be +/// stored. template void axpy(const AV& a, const XMV& X, const YMV& Y) { axpy(typename XMV::execution_space{}, a, X, Y); diff --git a/packages/kokkos-kernels/blas/src/KokkosBlas1_dot.hpp b/packages/kokkos-kernels/blas/src/KokkosBlas1_dot.hpp index ebccce7d7ce9..aa995836ebf9 100644 --- a/packages/kokkos-kernels/blas/src/KokkosBlas1_dot.hpp +++ b/packages/kokkos-kernels/blas/src/KokkosBlas1_dot.hpp @@ -96,25 +96,37 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { Kokkos::View>; - result_type result{}; - RVector_Result R = RVector_Result(&result); XVector_Internal X = x; YVector_Internal Y = y; - // Even though RVector is the template parameter, Dot::dot has an overload - // that accepts RVector_Internal (with the special accumulator, if dot_type is - // 32-bit precision). Impl::Dot needs to support both cases, and it's easier - // to do this with overloading than by extending the ETI to deal with two - // different scalar types. - Impl::DotSpecialAccumulator::dot(space, R, - X, Y); - space.fence(); - // mfh 22 Jan 2020: We need the line below because - // Kokkos::complex lacks a constructor that takes a - // Kokkos::complex with U != T. 
- return Kokkos::Details::CastPossiblyComplex::cast( - result); + bool useFallback = false; + if (useFallback) { + // Even though RVector is the template parameter, Dot::dot has an overload + // that accepts RVector_Internal (with the special accumulator, if dot_type + // is 32-bit precision). Impl::Dot needs to support both cases, and it's + // easier to do this with overloading than by extending the ETI to deal with + // two different scalar types. + result_type result{}; + RVector_Result R = RVector_Result(&result); + Impl::DotSpecialAccumulator::dot(space, + R, X, + Y); + space.fence(); + // mfh 22 Jan 2020: We need the line below because + // Kokkos::complex lacks a constructor that takes a + // Kokkos::complex with U != T. + return Kokkos::Details::CastPossiblyComplex::cast( + result); + } else { + dot_type result{}; + RVector_Internal R = RVector_Internal(&result); + Impl::Dot::dot(space, R, X, Y); + space.fence(); + return Kokkos::Details::CastPossiblyComplex::cast( + result); + } } /// \brief Return the dot product of the two vectors x and y. diff --git a/packages/kokkos-kernels/blas/src/KokkosBlas1_swap.hpp b/packages/kokkos-kernels/blas/src/KokkosBlas1_swap.hpp index 26c529f3b79c..9ddcd106dfb9 100644 --- a/packages/kokkos-kernels/blas/src/KokkosBlas1_swap.hpp +++ b/packages/kokkos-kernels/blas/src/KokkosBlas1_swap.hpp @@ -26,12 +26,12 @@ namespace KokkosBlas { /// \brief Swaps the entries of vectors x and y. /// /// \tparam execution_space an execution space to perform parallel work -/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. -/// \tparam YVector Type of the first vector y; a 1-D Kokkos::View. +/// \tparam XVector Type of the first vector x; a rank 1 Kokkos::View. +/// \tparam YVector Type of the first vector y; a rank 1 Kokkos::View. /// /// \param space [in] execution space passed to execution policies -/// \param x [in/out] 1-D View. -/// \param y [in/out] 1-D View. +/// \param x [in/out] rank 1 View. 
+/// \param y [in/out] rank 1 View. /// /// Swaps x and y. Note that this is akin to performing a deep_copy, swapping /// pointers inside view can only be performed if no aliasing, subviews, etc... @@ -100,11 +100,11 @@ void swap(execution_space const& space, XVector const& x, YVector const& y) { /// \brief Swaps the entries of vectors x and y. /// -/// \tparam XVector Type of the first vector x; a 1-D Kokkos::View. -/// \tparam YVector Type of the first vector y; a 1-D Kokkos::View. +/// \tparam XVector Type of the first vector x; a rank 1 Kokkos::View. +/// \tparam YVector Type of the first vector y; a rank 1 Kokkos::View. /// -/// \param x [in/out] 1-D View. -/// \param y [in/out] 1-D View. +/// \param x [in/out] rank 1 View. +/// \param y [in/out] rank 1 View. /// /// This function is non-blocking unless the underlying TPL requested /// at compile time is itself blocking. Note that the kernel will be diff --git a/packages/kokkos-kernels/blas/src/KokkosBlas2_ger.hpp b/packages/kokkos-kernels/blas/src/KokkosBlas2_ger.hpp index fbfc9c1f9854..8650577faf67 100644 --- a/packages/kokkos-kernels/blas/src/KokkosBlas2_ger.hpp +++ b/packages/kokkos-kernels/blas/src/KokkosBlas2_ger.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSBLAS2_GER_HPP_ #define KOKKOSBLAS2_GER_HPP_ +#include "KokkosKernels_helpers.hpp" + #include namespace KokkosBlas { @@ -42,15 +44,6 @@ template ::assignable, - "AViewType memory space must be assignable from XViewType"); - static_assert( - Kokkos::SpaceAccessibility::assignable, - "AViewType memory space must be assignable from YViewType"); - static_assert( Kokkos::SpaceAccessibility::accessible, diff --git a/packages/kokkos-kernels/blas/src/KokkosBlas2_syr.hpp b/packages/kokkos-kernels/blas/src/KokkosBlas2_syr.hpp index af66767ab4f0..00d1d8b3def5 100644 --- a/packages/kokkos-kernels/blas/src/KokkosBlas2_syr.hpp +++ b/packages/kokkos-kernels/blas/src/KokkosBlas2_syr.hpp @@ -17,6 +17,8 @@ #ifndef KOKKOSBLAS2_SYR_HPP_ #define KOKKOSBLAS2_SYR_HPP_ +#include 
"KokkosKernels_helpers.hpp" + #include namespace KokkosBlas { @@ -64,11 +66,6 @@ template void syr(const ExecutionSpace& space, const char trans[], const char uplo[], const typename AViewType::const_value_type& alpha, const XViewType& x, const AViewType& A) { - static_assert( - Kokkos::SpaceAccessibility::assignable, - "AViewType memory space must be assignable from XViewType"); - static_assert( Kokkos::SpaceAccessibility::accessible, diff --git a/packages/kokkos-kernels/blas/src/KokkosBlas2_syr2.hpp b/packages/kokkos-kernels/blas/src/KokkosBlas2_syr2.hpp new file mode 100644 index 000000000000..d86abd31c17c --- /dev/null +++ b/packages/kokkos-kernels/blas/src/KokkosBlas2_syr2.hpp @@ -0,0 +1,238 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_HPP_ +#define KOKKOSBLAS2_SYR2_HPP_ + +#include "KokkosKernels_helpers.hpp" + +#include +#include + +namespace KokkosBlas { + +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// matrix A that is: +/// - symmetric, A += alpha * x * y^T + alpha * y * x^T, or +/// - Hermitian, A += alpha * x * y^H + conj(alpha) * y * x^H. +/// +/// Important note 1: this routine encapsulates the syr2() and her2() +/// routines specified in BLAS documentations. It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). 
+/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian. +/// Moreover, this routine will always compute either the lower +/// portion or the upper portion (per user's request) of the final +/// matrix A. So, in order to obtain meaningful results, the user +/// must make sure to follow the conditions specified in the +/// "important note 1" above. +/// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honoring all +/// parameters passed, as stated in the "important note 2" above. +/// +/// Important note 4: Regarding parameter types: +/// - If A has components of real type (float or double), then: +/// - alpha must be of real type as well, +/// - components of x must be of real type as well, and +/// - components of y must be of real type as well. +/// - If A has components of complex type (complex or +/// complex), then: +/// - alpha must be of complex type as well (it may have zero +/// imaginary part, no problem), +/// - components of x may be of real type or complex type, and +/// - components of y may be of real type or complex type. +/// +/// \tparam ExecutionSpace The type of execution space +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param space [in] Execution space instance on which to run the kernel. +/// This may contain information about which stream to +/// run on. +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. 
+/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. Only the first character is taken into +/// account. +/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr2(const ExecutionSpace& space, const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + static_assert( + Kokkos::SpaceAccessibility::accessible, + "AViewType memory space must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "XViewType memory space must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "YViewType memory space must be accessible from ExecutionSpace"); + + static_assert(Kokkos::is_view::value, + "AViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "XViewType must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "YViewType must be a Kokkos::View."); + + static_assert(static_cast(AViewType::rank()) == 2, + "AViewType must have rank 2."); + static_assert(static_cast(XViewType::rank()) == 1, + "XViewType must have rank 1."); + static_assert(static_cast(YViewType::rank()) == 1, + "YViewType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if ((A.extent(0) == A.extent(1)) && (A.extent(0) == x.extent(0)) && + (A.extent(0) == y.extent(0))) { + // Ok + } else { + std::ostringstream os; + os << "KokkosBlas::syr2: Dimensions of A, x: " + << "A is " << A.extent(0) << " by " << A.extent(1) << ", x has size " + << x.extent(0) << ", y has size " << y.extent(0); + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((trans[0] == 'T') || (trans[0] == 't') || (trans[0] == 'H') || + (trans[0] == 'h')) { + // Ok + } else { + std::ostringstream os; + os << "KokkosBlas2::syr2(): invalid trans[0] = '" << trans[0] + << "'. It must be equalt to 'T' or 't' or 'H' or 'h'"; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + + if ((uplo[0] == 'U') || (uplo[0] == 'u') || (uplo[0] == 'L') || + (uplo[0] == 'l')) { + // Ok + } else { + std::ostringstream oss; + oss << "KokkosBlas2::syr2(): invalid uplo[0] = " << uplo[0] + << "'. It must be equalt to 'U' or 'u' or 'L' or 'l'"; + throw std::runtime_error(oss.str()); + } + + if ((A.extent(0) == 0) || (A.extent(1) == 0)) { + return; + } + + using ALayout = typename AViewType::array_layout; + + // Minimize the number of Impl::SYR2 instantiations, by standardizing + // on particular View specializations for its template parameters. + typedef Kokkos::View::array_layout, + typename XViewType::device_type, + Kokkos::MemoryTraits > + XVT; + + typedef Kokkos::View::array_layout, + typename YViewType::device_type, + Kokkos::MemoryTraits > + YVT; + + typedef Kokkos::View > + AVT; + + Impl::SYR2::syr2(space, trans, uplo, alpha, x, + y, A); +} + +/// \brief Rank-1 update (just lower portion or just upper portion) of a +/// matrix A that is: +/// - symmetric, A += alpha * x * y^T + alpha * y * x^T, or +/// - Hermitian, A += alpha * x * y^H + conj(alpha) * y * x^H. +/// +/// Important note 1: this routine encapsulates the syr2() and her2() +/// routines specified in BLAS documentations. 
It has the purpose of +/// updating a symmetric (or Hermitian) matrix A in such a way that +/// it continues to be symmetric (or Hermitian). +/// +/// Important note 2: however, this routine will honor all parameters +/// passed to it, even if A is not symmetric or not Hermitian. +/// Moreover, this routine will always compute either the lower +/// portion or the upper portion (per user's request) of the final +/// matrix A. So, in order to obtain meaningful results, the user +/// must make sure to follow the conditions specified in the +/// "important note 1" above. +/// +/// Important note 3: if TPL is enabled, this routine will call the +/// third party library BLAS routines whenever the parameters passed +/// are consistent with the parameters expected by the corresponding +/// TPL routine. If not, then this routine will route the execution +/// to the kokkos-kernels implementation, thus honoring all +/// parameters passed, as stated in the "important note 2" above. +/// +/// Important note 4: Regarding parameter types: +/// - If A has components of real type (float or double), then: +/// - alpha must be of real type as well, +/// - components of x must be of real type as well, and +/// - components of y must be of real type as well. +/// - If A has components of complex type (complex or +/// complex), then: +/// - alpha must be of complex type as well (it may have zero +/// imaginary part, no problem), +/// - components of x may be of real type or complex type, and +/// - components of y may be of real type or complex type. +/// +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Input vector, as a 1-D Kokkos::View +/// \tparam AViewType Input/Output matrix, as a 2-D Kokkos::View +/// +/// \param trans [in] "T" or "t" for transpose, "H" or "h" for Hermitian. +/// Only the first character is taken into account. +/// \param uplo [in] "U" or "u" for upper portion, "L" or "l" for lower +/// portion. 
Only the first character is taken into +/// account. +/// \param alpha [in] Input coefficient of x * x^{T,H} +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param y [in] Input vector, as a 1-D Kokkos::View +/// \param A [in/out] Output matrix, as a nonconst 2-D Kokkos::View +template +void syr2(const char trans[], const char uplo[], + const typename AViewType::const_value_type& alpha, const XViewType& x, + const YViewType& y, const AViewType& A) { + const typename AViewType::execution_space space = + typename AViewType::execution_space(); + syr2( + space, trans, uplo, alpha, x, y, A); +} + +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR2_HPP_ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index ca2139980d9f..3ba8f063b4ed 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -52,18 +52,22 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::HostSpace) + +// TODO: we met difficuties in FindTPLMKL.cmake to set the BLAS library properly +// such that the test in CheckHostBlasReturnComplex.cmake could not be +// compiled and run to give a correct answer on KK_BLAS_RESULT_AS_POINTER_ARG. +// This resulted in segfault in dot() with MKL and complex. +// So we just temporarily disable it until FindTPLMKL.cmake is fixed. 
+#if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) +#endif #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -// double -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ template <> \ struct dot_tpl_spec_avail< \ EXECSPACE, \ @@ -77,19 +81,27 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, enum : bool { value = true }; \ }; -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace) +#endif } // namespace Impl } // namespace KokkosBlas #endif diff --git 
a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index 718e32f14cce..ace26ebdbd12 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -39,71 +39,40 @@ inline void dot_print_specialization() { namespace KokkosBlas { namespace Impl { - -#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + MEMSPACE, ETI_SPEC_AVAIL) \ template \ - struct Dot< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void 
dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS,float]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_BLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ dot_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = HostBlas::dot(N, X.data(), one, Y.data(), one); \ + R() = HostBlas::dot( \ + N, reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -112,105 +81,22 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::dot( \ - N, reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Dot, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, 
LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const ExecSpace& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::dot( \ - N, reinterpret_cast*>(X.data()), one, \ - reinterpret_cast*>(Y.data()), one); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS( \ + 
Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::HostSpace, ETI_SPEC_AVAIL) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(false) } // namespace Impl } // namespace KokkosBlas - #endif // cuBLAS @@ -219,38 +105,48 @@ KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { - -#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ template <> \ - struct Dot< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + /* TODO: CUDA-12's 64-bit indices allow larger numElems */ \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ + const int N = static_cast(numElems); \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + 
TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ + 1, reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -259,81 +155,73 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasSdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasCdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ + Kokkos::Cuda, Kokkos::CudaSpace, cublasZdotc, ETI_SPEC_AVAIL) + 
+KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false) +} // namespace Impl +} // namespace KokkosBlas +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include -#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ +namespace KokkosBlas { +namespace Impl { +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ ETI_SPEC_AVAIL) \ template <> \ struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View*, LAYOUT, \ + typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ROCBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZdotc(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + TPL_DOT(s.handle, N, 
reinterpret_cast(X.data()), \ + 1, reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -342,72 +230,100 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_sdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::HIP, Kokkos::HIPSpace, \ + rocblas_ddot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_cdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::HIP, Kokkos::HIPSpace, rocblas_zdotc, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(false) +} // namespace Impl +} // namespace KokkosBlas +#endif + +// ONEMKL +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ template <> \ struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View*, LAYOUT, \ + typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void 
dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + static void dot(const EXECSPACE& exec, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + if (numElems <= \ + static_cast(std::numeric_limits::max())) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasCdotc(s.handle, N, reinterpret_cast(X.data()), \ - one, reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ + const std::int64_t N = static_cast(numElems); \ + TPL_DOT(exec.sycl_queue(), N, \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R())); \ } else { \ - Dot::dot(space, R, \ + Dot::dot(exec, R, \ X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + 
KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(false) } // namespace Impl } // namespace KokkosBlas - #endif #endif diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 04ec8119900e..be0a45c7be61 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -113,6 +113,40 @@ KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +// oneMKL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) + +#define KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL(SCALAR, LAYOUT, MEMSPACE) \ + template \ + struct nrm1_tpl_spec_avail< \ + ExecSpace, \ + Kokkos::View< \ + typename Kokkos::Details::InnerProductSpaceTraits::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1> { \ + enum : bool { value 
= true }; \ + }; + +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( + double, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( + float, Kokkos::LayoutLeft, Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( + Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_MKL_SYCL( + Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) + +#endif // KOKKOS_ENABLE_SYCL +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL + } // namespace Impl } // namespace KokkosBlas #endif diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp index b5b6e061ec5c..c695eaee1e33 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas1_nrm1_tpl_spec_decl.hpp @@ -39,161 +39,88 @@ inline void nrm1_print_specialization() { namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm1< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas::asum(N, X.data(), one); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ +#define 
KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(SCALAR, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + template <> \ struct Nrm1< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ + EXECSPACE, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail< \ + EXECSPACE, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using mag_type = typename Kokkos::ArithTraits::mag_type; \ + using RV = Kokkos::View>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS,float]"); \ + static void nrm1(const EXECSPACE& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_BLAS," #SCALAR "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ nrm1_print_specialization(); \ int N = numElems; \ int one = 1; \ - R() = HostBlas::asum(N, X.data(), one); \ + if constexpr (Kokkos::ArithTraits::is_complex) { \ + R() = HostBlas>::asum( \ + N, reinterpret_cast*>(X.data()), \ + one); \ + } else { \ + R() = HostBlas::asum(N, X.data(), one); \ + } \ } else { \ - Nrm1::nrm1(space, R, X); \ + Nrm1::value>::nrm1(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ 
- Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::asum( \ - N, reinterpret_cast*>(X.data()), one); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(LAYOUT, MEMSPACE, ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_BLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - int N = numElems; \ - int one = 1; \ - R() = HostBlas >::asum( \ - N, reinterpret_cast*>(X.data()), one); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) - -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, + 
Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) +#endif -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) +#endif -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - true) -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, - false) +#if defined(KOKKOS_ENABLE_THREADS) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Threads, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Threads, + Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads, Kokkos::HostSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads, Kokkos::HostSpace) +#endif } // namespace Impl } // namespace KokkosBlas @@ -207,202 +134,105 @@ KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +template +void cublasAsumWrapper(const ExecutionSpace& space, RViewType& R, + const XViewType& X) { + using XScalar = typename XViewType::non_const_value_type; + + 
nrm1_print_specialization(); + const int N = static_cast(X.extent(0)); + constexpr int one = 1; + KokkosBlas::Impl::CudaBlasSingleton& s = + KokkosBlas::Impl::CudaBlasSingleton::singleton(); + + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, space.cuda_stream())); + if constexpr (std::is_same_v) { + KOKKOS_CUBLAS_SAFE_CALL_IMPL( + cublasSasum(s.handle, N, X.data(), one, R.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_CUBLAS_SAFE_CALL_IMPL( + cublasDasum(s.handle, N, X.data(), one, R.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUBLAS_SAFE_CALL_IMPL( + cublasScasum(s.handle, N, reinterpret_cast(X.data()), + one, R.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDzasum( + s.handle, N, reinterpret_cast(X.data()), one, + R.data())); + } + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); +} + +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(SCALAR, LAYOUT, MEMSPACE) \ template <> \ struct Nrm1< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ + Kokkos::Cuda, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using execution_space = Kokkos::Cuda; \ + using RV = Kokkos::View::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ \ static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - 
Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS," #SCALAR \ + "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasDasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + cublasAsumWrapper(space, R, X); \ } else { \ - Nrm1::nrm1(space, \ - R, X); \ + Nrm1::value>::nrm1(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm1< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, \ - R, X); \ 
- } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDzasum( \ - s.handle, N, reinterpret_cast(X.data()), \ - one, R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, \ - R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - using execution_space = EXECSPACE; \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const execution_space& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < 
static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ - cublasSetStream(s.handle, space.cuda_stream())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasScasum( \ - s.handle, N, reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, \ - R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(float, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(double, Kokkos::LayoutLeft, + 
Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +#endif } // namespace Impl } // namespace KokkosBlas - -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS // rocBLAS #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS @@ -411,195 +241,218 @@ KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ +template +void rocblasAsumWrapper(const ExecutionSpace& space, RViewType& R, + const XViewType& X) { + using XScalar = typename XViewType::non_const_value_type; + + nrm1_print_specialization(); + const int N = static_cast(X.extent(0)); + constexpr int one = 1; + KokkosBlas::Impl::RocBlasSingleton& s = + KokkosBlas::Impl::RocBlasSingleton::singleton(); + + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_set_stream(s.handle, space.hip_stream())); + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_sasum(s.handle, N, X.data(), one, R.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_dasum(s.handle, N, X.data(), one, R.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( + s.handle, N, reinterpret_cast(X.data()), + one, R.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( + s.handle, N, reinterpret_cast(X.data()), + one, R.data())); + } + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); +} + +#define KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ struct Nrm1< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - 
XV; \ - typedef typename XV::size_type size_type; \ + Kokkos::HIP, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using RV = Kokkos::View::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS,double]"); \ + static void nrm1(const Kokkos::HIP& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS," #SCALAR \ + "]"); \ const size_type numElems = X.extent(0); \ if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_dasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + rocblasAsumWrapper(space, R, X); \ } else { \ - Nrm1::nrm1(space, R, X); \ + Nrm1::value>::nrm1(space, R, \ + X); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1< \ - ExecSpace, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const 
ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ROCBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_sasum(s.handle, N, X.data(), one, R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(float, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(double, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSBLAS1_NRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) -#define KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - 
rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_dzasum( \ - s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +} // namespace Impl +} // namespace KokkosBlas -#define KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(LAYOUT, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct Nrm1 >, \ - Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrm1(const ExecSpace& space, RV& R, const XV& X) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::nrm1[TPL_ROCBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - nrm1_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::RocBlasSingleton& s = \ - KokkosBlas::Impl::RocBlasSingleton::singleton(); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ - rocblas_set_stream(s.handle, space.hip_stream())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_scasum( \ - s.handle, N, \ - reinterpret_cast(X.data()), one, \ - R.data())); \ - KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ - } else { \ - Nrm1::nrm1(space, R, X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + +// oneMKL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) + +#include +#include -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS1_DNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) +namespace 
KokkosBlas { +namespace Impl { -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS1_SNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) +template +void onemklAsumWrapper(const ExecutionSpace& space, RViewType& R, + const XViewType& X) { + using XScalar = typename XViewType::non_const_value_type; + using KAT_X = Kokkos::ArithTraits; + using layout_t = typename XViewType::array_layout; + + const std::int64_t N = static_cast(X.extent(0)); + + // Create temp view on device to store the result + Kokkos::View::mag_type, + typename XViewType::memory_space> + res("sycl asum result"); + + // Decide to call row_major or column_major function + if constexpr (std::is_same_v) { + if constexpr (KAT_X::is_complex) { + oneapi::mkl::blas::row_major::asum( + space.sycl_queue(), N, + reinterpret_cast*>( + X.data()), + 1, res.data()); + } else { + oneapi::mkl::blas::row_major::asum(space.sycl_queue(), N, X.data(), 1, + res.data()); + } + } else { + if constexpr (KAT_X::is_complex) { + oneapi::mkl::blas::column_major::asum( + space.sycl_queue(), N, + reinterpret_cast*>( + X.data()), + 1, res.data()); + } else { + oneapi::mkl::blas::column_major::asum(space.sycl_queue(), X.extent_int(0), + X.data(), 1, res.data()); + } + } + // Bring result back to host + Kokkos::deep_copy(space, R, res); +} -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS1_ZNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) +#define KOKKOSBLAS1_NRM1_ONEMKL(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct Nrm1< \ + Kokkos::Experimental::SYCL, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 1, true, \ + nrm1_eti_spec_avail< \ + Kokkos::Experimental::SYCL, \ + Kokkos::View::mag_type, LAYOUT, \ + Kokkos::HostSpace, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> 
{ \ + using execution_space = Kokkos::Experimental::SYCL; \ + using RV = Kokkos::View::mag_type, \ + LAYOUT, Kokkos::HostSpace, \ + Kokkos::MemoryTraits>; \ + using XV = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using size_type = typename XV::size_type; \ + \ + static void nrm1(const execution_space& space, RV& R, const XV& X) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::nrm1[TPL_ONEMKL," #SCALAR \ + "]"); \ + const size_type numElems = X.extent(0); \ + if (numElems < static_cast(INT_MAX)) { \ + onemklAsumWrapper(space, R, X); \ + } else { \ + Nrm1::value>::nrm1(space, R, X); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - true) -KOKKOSBLAS1_CNRM1_TPL_SPEC_DECL_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIPSpace, - false) +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLDeviceUSMSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE) +KOKKOSBLAS1_NRM1_ONEMKL(float, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(double, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLSharedUSMSpace) +KOKKOSBLAS1_NRM1_ONEMKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Experimental::SYCLSharedUSMSpace) +#endif } // namespace Impl } // namespace KokkosBlas -#endif +#endif // KOKKOS_ENABLE_SYCL +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL #endif diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp index 
3ba437a5a7cb..bc1a10f61eac 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_blas.hpp @@ -149,9 +149,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE& /* space */ \ - , \ - const char trans[], \ + static void ger(const EXEC_SPACE& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ @@ -183,8 +181,9 @@ namespace Impl { reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(A.data()), LDA); \ } else { \ - throw std::runtime_error( \ - "Error: blasZgerc() requires LayoutLeft views."); \ + /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ Kokkos::Profiling::popRegion(); \ @@ -218,9 +217,7 @@ namespace Impl { Kokkos::MemoryTraits> \ AViewType; \ \ - static void ger(const EXEC_SPACE& /* space */ \ - , \ - const char trans[], \ + static void ger(const EXEC_SPACE& space, const char trans[], \ typename AViewType::const_value_type& alpha, \ const XViewType& X, const YViewType& Y, \ const AViewType& A) { \ @@ -252,8 +249,9 @@ namespace Impl { reinterpret_cast*>(X.data()), one, \ reinterpret_cast*>(A.data()), LDA); \ } else { \ - throw std::runtime_error( \ - "Error: blasCgerc() requires LayoutLeft views."); \ + /* blasgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ Kokkos::Profiling::popRegion(); \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp index d05b09784e36..3f80144f62c5 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_cublas.hpp @@ -196,8 +196,9 @@ namespace Impl { 
reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ } else { \ - throw std::runtime_error( \ - "Error: cublasZgerc() requires LayoutLeft views."); \ + /* cublasZgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ @@ -266,8 +267,9 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ } else { \ - throw std::runtime_error( \ - "Error: cublasCgerc() requires LayoutLeft views."); \ + /* cublasCgerc() + ~A_ll => call kokkos-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp index c55d091516f9..c21b61befaf5 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_ger_tpl_spec_decl_rocblas.hpp @@ -199,8 +199,9 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ } else { \ - throw std::runtime_error( \ - "Error: rocblasZgerc() requires LayoutLeft views."); \ + /* rocblas_zgerc() + ~A_ll => call k-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ @@ -273,8 +274,9 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(A.data()), LDA)); \ } else { \ - throw std::runtime_error( \ - "Error: rocblasCgec() requires LayoutLeft views."); \ + /* rocblas_cgerc() + ~A_ll => call k-kernels' implementation */ \ + GER::ger(space, trans, alpha, X, Y, A); \ } \ } \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp 
b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp new file mode 100644 index 000000000000..59fb154d3530 --- /dev/null +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_avail.hpp @@ -0,0 +1,205 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosBlas { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct syr2_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Generic Host side BLAS (could be MKL or whatever) +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) + 
+KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, + Kokkos::Serial, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Serial, + Kokkos::HostSpace) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(double, Kokkos::LayoutRight, + Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(float, Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::OpenMP, + Kokkos::HostSpace) +#endif + +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) 
+KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, + Kokkos::Cuda, Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutRight, + Kokkos::Cuda, Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace) + +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS + +#define KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(SCALAR, LAYOUT, EXEC_SPACE, \ + MEM_SPACE) \ + template <> \ + struct syr2_tpl_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + 
+KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) + +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(double, Kokkos::LayoutRight, + Kokkos::HIP, Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(float, Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) +KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_ROCBLAS(Kokkos::complex, + Kokkos::LayoutRight, Kokkos::HIP, + Kokkos::HIPSpace) + +#endif +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS2_SYR2_TPL_SPEC_AVAIL_HPP_ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp new file mode 100644 index 000000000000..66ba81b685b7 --- /dev/null +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl.hpp @@ -0,0 +1,35 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_HPP_ + +// BLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS +#include +#endif + +// cuBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +#include +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include +#endif + +#endif diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp new file mode 100644 index 000000000000..8561675c72f0 --- /dev/null +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_blas.hpp @@ -0,0 +1,317 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_BLAS_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_BLAS_HPP_ + +#include "KokkosBlas_Host_tpl.hpp" + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? 
A.stride(0) : A.stride(1); + +#define KOKKOSBLAS2_DSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,double]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ + one, A.data(), LDA); \ + } else { \ + /* blasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + 
Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_BLAS,float]"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + if (A_is_ll) { \ + HostBlas::syr2(uplo[0], N, alpha, X.data(), one, Y.data(), \ + one, A.data(), LDA); \ + } else { \ + /* blasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR2_BLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasZsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::zher2( \ + uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR2_BLAS(LAYOUT, 
EXEC_SPACE, MEM_SPACE, ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits> \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_BLAS,complex"); \ + KOKKOSBLAS2_SYR2_DETERMINE_ARGS(LAYOUT); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + /* No blasCsyr2() => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } else { \ + if (A_is_ll) { \ + HostBlas>::cher2( \ + uplo[0], N, alpha, \ + reinterpret_cast*>(X.data()), one, \ + reinterpret_cast*>(Y.data()), one, \ + reinterpret_cast*>(A.data()), LDA); \ + } else { \ + /* blasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) 
+KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::Serial, Kokkos::HostSpace, + false) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_DSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_SSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) 
+KOKKOSBLAS2_ZSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) + +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace, + false) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + true) +KOKKOSBLAS2_CSYR2_BLAS(Kokkos::LayoutRight, Kokkos::OpenMP, Kokkos::HostSpace, + false) +#endif + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp new file mode 100644 index 000000000000..ca98fedf0d98 --- /dev/null +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_cublas.hpp @@ -0,0 +1,372 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_CUBLAS_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_CUBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + cublasFillMode_t fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? 
CUBLAS_FILL_MODE_LOWER \ + : CUBLAS_FILL_MODE_UPPER; + +#define KOKKOSBLAS2_DSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasDsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits 
> \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasSsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + 
KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZsyr2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZher2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasZher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR2_CUBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + 
YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_CUBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCsyr2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll) { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCher2(s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } else { \ + /* cublasCher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) 
+KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_DSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_SSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) 
+KOKKOSBLAS2_ZSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, + false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, Kokkos::CudaSpace, + false) + +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace, + false) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, true) +KOKKOSBLAS2_CSYR2_CUBLAS(Kokkos::LayoutRight, Kokkos::Cuda, + Kokkos::CudaUVMSpace, false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp new file mode 100644 index 000000000000..869c065af286 --- /dev/null +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas2_syr2_tpl_spec_decl_rocblas.hpp @@ -0,0 +1,336 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_ROCBLAS_HPP_ +#define KOKKOSBLAS2_SYR2_TPL_SPEC_DECL_ROCBLAS_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uploChar) \ + bool A_is_ll = std::is_same::value; \ + bool A_is_lr = std::is_same::value; \ + const int N = static_cast(A_is_lr ? A.extent(0) : A.extent(1)); \ + constexpr int one = 1; \ + const int LDA = A_is_lr ? A.stride(0) : A.stride(1); \ + rocblas_fill fillMode = (uploChar == 'L' || uploChar == 'l') \ + ? rocblas_fill_lower \ + : rocblas_fill_upper; + +#define KOKKOSBLAS2_DSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,double]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_dsyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* 
rocblas_dsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_SSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::syr2[TPL_ROCBLAS,float]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_ssyr2(s.handle, fillMode, N, &alpha, X.data(), one, \ + Y.data(), one, A.data(), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_ssyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_ZSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + 
Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zsyr2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zsyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zher2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_zher2() + ~A_ll => call 
kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSBLAS2_CSYR2_ROCBLAS(LAYOUT, EXEC_SPACE, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template <> \ + struct SYR2*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + \ + static void syr2(const typename AViewType::execution_space& space, \ + const char trans[], const char uplo[], \ + typename AViewType::const_value_type& alpha, \ + const XViewType& X, const YViewType& Y, \ + const AViewType& A) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::syr2[TPL_ROCBLAS,complex]"); \ + KOKKOSBLAS2_SYR2_ROCBLAS_DETERMINE_ARGS(LAYOUT, uplo[0]); \ + bool justTranspose = (trans[0] == 'T') || (trans[0] == 't'); \ + if (justTranspose) { \ + if (A_is_ll) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_csyr2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_csyr2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } else { \ + if (A_is_ll && (alpha.imag() == 0.)) { \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + 
KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cher2( \ + s.handle, fillMode, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(Y.data()), one, \ + reinterpret_cast(A.data()), LDA)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ + } else { \ + /* rocblas_cher2() + ~A_ll => call kokkos-kernels' implementation */ \ + SYR2::syr2(space, trans, uplo, alpha, X, Y, A); \ + } \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_DSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_SSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_ZSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + false) + +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, + false) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, Kokkos::HIP, Kokkos::HIPSpace, + true) +KOKKOSBLAS2_CSYR2_ROCBLAS(Kokkos::LayoutRight, 
Kokkos::HIP, Kokkos::HIPSpace, + false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index 66177e28a6ec..68bf2708eca8 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -60,20 +60,20 @@ namespace Impl { Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS," #SCALAR_TYPE \ "]"); \ const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = C.extent(0); \ - const int N = C.extent(1); \ - const int K = A.extent(A_t ? 0 : 1); \ + const KK_INT M = C.extent(0); \ + const KK_INT N = C.extent(1); \ + const KK_INT K = A.extent(A_t ? 0 : 1); \ \ bool A_is_lr = std::is_same::value; \ bool B_is_lr = std::is_same::value; \ bool C_is_lr = std::is_same::value; \ \ - const int AST = A_is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const int BST = B_is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const int CST = C_is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 1 : CST; \ + const KK_INT AST = A_is_lr ? A.stride(0) : A.stride(1), \ + LDA = AST == 0 ? 1 : AST; \ + const KK_INT BST = B_is_lr ? B.stride(0) : B.stride(1), \ + LDB = BST == 0 ? 1 : BST; \ + const KK_INT CST = C_is_lr ? C.stride(0) : C.stride(1), \ + LDC = CST == 0 ? 
1 : CST; \ \ const BASE_SCALAR_TYPE alpha_val = alpha, beta_val = beta; \ if (!A_is_lr && !B_is_lr && !C_is_lr) \ diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.cpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.cpp index 6b158f4d196c..50aab57c7368 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.cpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.cpp @@ -22,140 +22,162 @@ #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) +using KokkosBlas::Impl::KK_INT; + /// Fortran headers extern "C" { /// /// scal /// -void F77_BLAS_MANGLE(sscal, SSCAL)(const int* N, const float* alpha, - /* */ float* x, const int* x_inc); -void F77_BLAS_MANGLE(dscal, DSCAL)(const int* N, const double* alpha, - /* */ double* x, const int* x_inc); +void F77_BLAS_MANGLE(sscal, SSCAL)(const KK_INT* N, const float* alpha, + /* */ float* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(dscal, DSCAL)(const KK_INT* N, const double* alpha, + /* */ double* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(cscal, - CSCAL)(const int* N, const std::complex* alpha, - /* */ std::complex* x, const int* x_inc); + CSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(zscal, - ZSCAL)(const int* N, const std::complex* alpha, - /* */ std::complex* x, const int* x_inc); + ZSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); /// /// max /// -int F77_BLAS_MANGLE(isamax, ISAMAX)(const int* N, const float* x, - const int* x_inc); -int F77_BLAS_MANGLE(idamax, IDAMAX)(const int* N, const double* x, - const int* x_inc); -int F77_BLAS_MANGLE(icamax, ICAMAX)(const int* N, const std::complex* x, - const int* x_inc); -int F77_BLAS_MANGLE(izamax, IZAMAX)(const int* N, const std::complex* x, - const int* x_inc); +KK_INT F77_BLAS_MANGLE(isamax, ISAMAX)(const KK_INT* N, const float* x, + const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(idamax, IDAMAX)(const KK_INT* N, const 
double* x, + const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(icamax, ICAMAX)(const KK_INT* N, + const std::complex* x, + const KK_INT* x_inc); +KK_INT F77_BLAS_MANGLE(izamax, IZAMAX)(const KK_INT* N, + const std::complex* x, + const KK_INT* x_inc); /// /// nrm2 /// -float F77_BLAS_MANGLE(snrm2, SNRM2)(const int* N, const float* x, - const int* x_inc); -double F77_BLAS_MANGLE(dnrm2, DNRM2)(const int* N, const double* x, - const int* x_inc); -float F77_BLAS_MANGLE(scnrm2, SCNRM2)(const int* N, +float F77_BLAS_MANGLE(snrm2, SNRM2)(const KK_INT* N, const float* x, + const KK_INT* x_inc); +double F77_BLAS_MANGLE(dnrm2, DNRM2)(const KK_INT* N, const double* x, + const KK_INT* x_inc); +float F77_BLAS_MANGLE(scnrm2, SCNRM2)(const KK_INT* N, const std::complex* x, - const int* x_inc); -double F77_BLAS_MANGLE(dznrm2, DZNRM2)(const int* N, + const KK_INT* x_inc); +double F77_BLAS_MANGLE(dznrm2, DZNRM2)(const KK_INT* N, const std::complex* x, - const int* x_inc); + const KK_INT* x_inc); /// /// sum /// -float F77_BLAS_MANGLE(sasum, SASUM)(const int* N, const float* x, - const int* x_inc); -double F77_BLAS_MANGLE(dasum, DASUM)(const int* N, const double* x, - const int* x_inc); -float F77_BLAS_MANGLE(scasum, SCASUM)(const int* N, +float F77_BLAS_MANGLE(sasum, SASUM)(const KK_INT* N, const float* x, + const KK_INT* x_inc); +double F77_BLAS_MANGLE(dasum, DASUM)(const KK_INT* N, const double* x, + const KK_INT* x_inc); +float F77_BLAS_MANGLE(scasum, SCASUM)(const KK_INT* N, const std::complex* x, - const int* x_inc); -double F77_BLAS_MANGLE(dzasum, DZASUM)(const int* N, + const KK_INT* x_inc); +double F77_BLAS_MANGLE(dzasum, DZASUM)(const KK_INT* N, const std::complex* x, - const int* x_inc); + const KK_INT* x_inc); /// /// dot /// -float F77_BLAS_MANGLE(sdot, SDOT)(const int* N, const float* x, - const int* x_inc, const float* y, - const int* y_inc); -double F77_BLAS_MANGLE(ddot, DDOT)(const int* N, const double* x, - const int* x_inc, const double* y, - const int* y_inc); +float 
F77_BLAS_MANGLE(sdot, SDOT)(const KK_INT* N, const float* x, + const KK_INT* x_inc, const float* y, + const KK_INT* y_inc); +double F77_BLAS_MANGLE(ddot, DDOT)(const KK_INT* N, const double* x, + const KK_INT* x_inc, const double* y, + const KK_INT* y_inc); #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) -std::complex F77_BLAS_MANGLE(cdotu, CDOTU)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); -std::complex F77_BLAS_MANGLE(zdotu, ZDOTU)( - const int* N, const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); -std::complex F77_BLAS_MANGLE(cdotc, CDOTC)(const int* N, - const std::complex* x, - const int* x_inc, - const std::complex* y, - const int* y_inc); -std::complex F77_BLAS_MANGLE(zdotc, ZDOTC)( - const int* N, const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); +// clang-format off +// For the return type, don't use std::complex, otherwise compiler will complain +// error: 'cdotu_' has C-linkage specified, but returns user-defined type 'std::complex' which is incompatible with C [-Werror,-Wreturn-type-c-linkage]" +// But with float _Complex, I got error: '_Complex' is a C99 extension [-Werror,-Wc99-extensions]. +// So I just use a C struct. 
+// clang-format on +typedef struct { + float vals[2]; +} _kk_float2; +typedef struct { + double vals[2]; +} _kk_double2; + +_kk_float2 F77_BLAS_MANGLE(cdotu, CDOTU)(const KK_INT* N, + const std::complex* x, + const KK_INT* x_inc, + const std::complex* y, + const KK_INT* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotu, ZDOTU)(const KK_INT* N, + const std::complex* x, + const KK_INT* x_inc, + const std::complex* y, + const KK_INT* y_inc); +_kk_float2 F77_BLAS_MANGLE(cdotc, CDOTC)(const KK_INT* N, + const std::complex* x, + const KK_INT* x_inc, + const std::complex* y, + const KK_INT* y_inc); +_kk_double2 F77_BLAS_MANGLE(zdotc, ZDOTC)(const KK_INT* N, + const std::complex* x, + const KK_INT* x_inc, + const std::complex* y, + const KK_INT* y_inc); #else void F77_BLAS_MANGLE(cdotu, - CDOTU)(std::complex* res, const int* N, - const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); + CDOTU)(std::complex* res, const KK_INT* N, + const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(zdotu, - ZDOTU)(std::complex* res, const int* N, - const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); + ZDOTU)(std::complex* res, const KK_INT* N, + const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(cdotc, - CDOTC)(std::complex* res, const int* N, - const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); + CDOTC)(std::complex* res, const KK_INT* N, + const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(zdotc, - ZDOTC)(std::complex* res, const int* N, - const std::complex* x, const int* x_inc, - const std::complex* y, const int* y_inc); + ZDOTC)(std::complex* res, const KK_INT* N, + const std::complex* x, const KK_INT* x_inc, + const std::complex* y, const KK_INT* y_inc); #endif /// /// axpy /// -void F77_BLAS_MANGLE(saxpy, 
SAXPY)(const int* N, const float* alpha, - const float* x, const int* x_inc, - /* */ float* y, const int* y_inc); -void F77_BLAS_MANGLE(daxpy, DAXPY)(const int* N, const double* alpha, - const double* x, const int* x_inc, - /* */ double* y, const int* y_inc); +void F77_BLAS_MANGLE(saxpy, SAXPY)(const KK_INT* N, const float* alpha, + const float* x, const KK_INT* x_inc, + /* */ float* y, const KK_INT* y_inc); +void F77_BLAS_MANGLE(daxpy, DAXPY)(const KK_INT* N, const double* alpha, + const double* x, const KK_INT* x_inc, + /* */ double* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(caxpy, - CAXPY)(const int* N, const std::complex* alpha, - const std::complex* x, const int* x_inc, - /* */ std::complex* y, const int* y_inc); + CAXPY)(const KK_INT* N, const std::complex* alpha, + const std::complex* x, const KK_INT* x_inc, + /* */ std::complex* y, const KK_INT* y_inc); void F77_BLAS_MANGLE(zaxpy, - ZAXPY)(const int* N, const std::complex* alpha, - const std::complex* x, const int* x_inc, - /* */ std::complex* y, const int* y_inc); + ZAXPY)(const KK_INT* N, const std::complex* alpha, + const std::complex* x, const KK_INT* x_inc, + /* */ std::complex* y, const KK_INT* y_inc); /// /// rot /// -void F77_BLAS_MANGLE(srot, SROT)(int const* N, float* X, int const* incx, - float* Y, int const* incy, float* c, float* s); -void F77_BLAS_MANGLE(drot, DROT)(int const* N, double* X, int const* incx, - double* Y, int const* incy, double* c, +void F77_BLAS_MANGLE(srot, SROT)(KK_INT const* N, float* X, KK_INT const* incx, + float* Y, KK_INT const* incy, float* c, + float* s); +void F77_BLAS_MANGLE(drot, DROT)(KK_INT const* N, double* X, KK_INT const* incx, + double* Y, KK_INT const* incy, double* c, double* s); -void F77_BLAS_MANGLE(crot, CROT)(int const* N, std::complex* X, - int const* incx, std::complex* Y, - int const* incy, float* c, float* s); -void F77_BLAS_MANGLE(zrot, ZROT)(int const* N, std::complex* X, - int const* incx, std::complex* Y, - int const* incy, double* c, 
double* s); +void F77_BLAS_MANGLE(crot, CROT)(KK_INT const* N, std::complex* X, + KK_INT const* incx, std::complex* Y, + KK_INT const* incy, float* c, float* s); +void F77_BLAS_MANGLE(zrot, ZROT)(KK_INT const* N, std::complex* X, + KK_INT const* incx, std::complex* Y, + KK_INT const* incy, double* c, double* s); /// /// rotg @@ -172,12 +194,12 @@ void F77_BLAS_MANGLE(zrotg, ZROTG)(std::complex* a, /// /// rotm /// -void F77_BLAS_MANGLE(srotm, SROTM)(const int* n, float* X, const int* incx, - float* Y, const int* incy, - float const* param); -void F77_BLAS_MANGLE(drotm, DROTM)(const int* n, double* X, const int* incx, - double* Y, const int* incy, - double const* param); +void F77_BLAS_MANGLE(srotm, SROTM)(const KK_INT* n, float* X, + const KK_INT* incx, float* Y, + const KK_INT* incy, float const* param); +void F77_BLAS_MANGLE(drotm, DROTM)(const KK_INT* n, double* X, + const KK_INT* incx, double* Y, + const KK_INT* incy, double const* param); /// /// rotmg @@ -190,72 +212,78 @@ void F77_BLAS_MANGLE(drotmg, DROTMG)(double* d1, double* d2, double* x1, /// /// swap /// -void F77_BLAS_MANGLE(sswap, SSWAP)(int const* N, float* X, int const* incx, - float* Y, int const* incy); -void F77_BLAS_MANGLE(dswap, DSWAP)(int const* N, double* X, int const* incx, - double* Y, int const* incy); -void F77_BLAS_MANGLE(cswap, CSWAP)(int const* N, std::complex* X, - int const* incx, std::complex* Y, - int const* incy); -void F77_BLAS_MANGLE(zswap, ZSWAP)(int const* N, std::complex* X, - int const* incx, std::complex* Y, - int const* incy); +void F77_BLAS_MANGLE(sswap, SSWAP)(KK_INT const* N, float* X, + KK_INT const* incx, float* Y, + KK_INT const* incy); +void F77_BLAS_MANGLE(dswap, DSWAP)(KK_INT const* N, double* X, + KK_INT const* incx, double* Y, + KK_INT const* incy); +void F77_BLAS_MANGLE(cswap, CSWAP)(KK_INT const* N, std::complex* X, + KK_INT const* incx, std::complex* Y, + KK_INT const* incy); +void F77_BLAS_MANGLE(zswap, ZSWAP)(KK_INT const* N, std::complex* X, + KK_INT 
const* incx, std::complex* Y, + KK_INT const* incy); /// /// Gemv /// -void F77_BLAS_MANGLE(sgemv, SGEMV)(const char*, int*, int*, const float*, - const float*, int*, const float*, int*, +void F77_BLAS_MANGLE(sgemv, SGEMV)(const char*, KK_INT*, KK_INT*, const float*, + const float*, KK_INT*, const float*, KK_INT*, const float*, - /* */ float*, int*); -void F77_BLAS_MANGLE(dgemv, DGEMV)(const char*, int*, int*, const double*, - const double*, int*, const double*, int*, - const double*, - /* */ double*, int*); -void F77_BLAS_MANGLE(cgemv, CGEMV)(const char*, int*, int*, + /* */ float*, KK_INT*); +void F77_BLAS_MANGLE(dgemv, DGEMV)(const char*, KK_INT*, KK_INT*, const double*, + const double*, KK_INT*, const double*, + KK_INT*, const double*, + /* */ double*, KK_INT*); +void F77_BLAS_MANGLE(cgemv, CGEMV)(const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); -void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, int*, int*, + /* */ std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgemv, ZGEMV)(const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); + /* */ std::complex*, KK_INT*); /// /// Ger /// -void F77_BLAS_MANGLE(sger, SGER)(int*, int*, const float*, const float*, int*, - const float*, int*, float*, int*); -void F77_BLAS_MANGLE(dger, DGER)(int*, int*, const double*, const double*, int*, - const double*, int*, double*, int*); -void F77_BLAS_MANGLE(cgeru, CGERU)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); -void F77_BLAS_MANGLE(zgeru, ZGERU)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); -void 
F77_BLAS_MANGLE(cgerc, CGERC)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); -void F77_BLAS_MANGLE(zgerc, ZGERC)(int*, int*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - std::complex*, int*); +void F77_BLAS_MANGLE(sger, SGER)(KK_INT*, KK_INT*, const float*, const float*, + KK_INT*, const float*, KK_INT*, float*, + KK_INT*); +void F77_BLAS_MANGLE(dger, DGER)(KK_INT*, KK_INT*, const double*, const double*, + KK_INT*, const double*, KK_INT*, double*, + KK_INT*); +void F77_BLAS_MANGLE(cgeru, CGERU)(KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgeru, ZGERU)(KK_INT*, KK_INT*, + const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(cgerc, CGERC)(KK_INT*, KK_INT*, const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgerc, ZGERC)(KK_INT*, KK_INT*, + const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); /// /// Syr /// -void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, int*, const float*, const float*, - int*, float*, int*); -void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, int*, const double*, - const double*, int*, double*, int*); +void F77_BLAS_MANGLE(ssyr, SSYR)(const char*, KK_INT*, const float*, + const float*, KK_INT*, float*, KK_INT*); +void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, KK_INT*, const double*, + const double*, KK_INT*, double*, KK_INT*); // Although there is a cgeru, there is no csyru // Although there is a zgeru, there is no zsyru // Although there is a cgerc, there is no csyrc, but there is cher (see below) @@ -265,135 +293,166 @@ void F77_BLAS_MANGLE(dsyr, DSYR)(const char*, int*, const double*, /// Her /// -void F77_BLAS_MANGLE(cher, 
CHER)(const char*, int*, const float*, - const std::complex*, int*, - std::complex*, int*); -void F77_BLAS_MANGLE(zher, ZHER)(const char*, int*, const double*, - const std::complex*, int*, - std::complex*, int*); +void F77_BLAS_MANGLE(cher, CHER)(const char*, KK_INT*, const float*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zher, ZHER)(const char*, KK_INT*, const double*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); + +/// +/// Syr2 +/// +void F77_BLAS_MANGLE(ssyr2, SSYR2)(const char*, KK_INT*, const float*, + const float*, const KK_INT*, const float*, + KK_INT*, float*, KK_INT*); +void F77_BLAS_MANGLE(dsyr2, DSYR2)(const char*, KK_INT*, const double*, + const double*, const KK_INT*, const double*, + KK_INT*, double*, KK_INT*); +// Although there is a cgeru, there is no csyr2u +// Although there is a zgeru, there is no zsyr2u +// Although there is a cgerc, there is no csyr2c, but there is cher2 (see below) +// Although there is a zgerc, there is no zsyr2c, but there is zher2 (see below) + +/// +/// Her2 +/// + +void F77_BLAS_MANGLE(cher2, CHER2)(const char*, KK_INT*, + const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zher2, ZHER2)(const char*, KK_INT*, + const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, + std::complex*, KK_INT*); /// /// Trsv /// -void F77_BLAS_MANGLE(strsv, STRSV)(const char*, const char*, const char*, int*, - const float*, int*, - /* */ float*, int*); -void F77_BLAS_MANGLE(dtrsv, DTRSV)(const char*, const char*, const char*, int*, - const double*, int*, - /* */ double*, int*); -void F77_BLAS_MANGLE(ctrsv, CTRSV)(const char*, const char*, const char*, int*, - const std::complex*, int*, - /* */ std::complex*, int*); -void F77_BLAS_MANGLE(ztrsv, ZTRSV)(const char*, const char*, const char*, int*, - const std::complex*, int*, - /* */ std::complex*, int*); +void 
F77_BLAS_MANGLE(strsv, STRSV)(const char*, const char*, const char*, + KK_INT*, const float*, KK_INT*, + /* */ float*, KK_INT*); +void F77_BLAS_MANGLE(dtrsv, DTRSV)(const char*, const char*, const char*, + KK_INT*, const double*, KK_INT*, + /* */ double*, KK_INT*); +void F77_BLAS_MANGLE(ctrsv, CTRSV)(const char*, const char*, const char*, + KK_INT*, const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); +void F77_BLAS_MANGLE(ztrsv, ZTRSV)(const char*, const char*, const char*, + KK_INT*, const std::complex*, + KK_INT*, + /* */ std::complex*, KK_INT*); /// /// Gemm /// -void F77_BLAS_MANGLE(sgemm, SGEMM)(const char*, const char*, int*, int*, int*, - const float*, const float*, int*, - const float*, int*, const float*, - /* */ float*, int*); -void F77_BLAS_MANGLE(dgemm, DGEMM)(const char*, const char*, int*, int*, int*, - const double*, const double*, int*, - const double*, int*, const double*, - /* */ double*, int*); -void F77_BLAS_MANGLE(cgemm, CGEMM)(const char*, const char*, int*, int*, int*, - const std::complex*, - const std::complex*, int*, - const std::complex*, int*, +void F77_BLAS_MANGLE(sgemm, SGEMM)(const char*, const char*, KK_INT*, KK_INT*, + KK_INT*, const float*, const float*, KK_INT*, + const float*, KK_INT*, const float*, + /* */ float*, KK_INT*); +void F77_BLAS_MANGLE(dgemm, DGEMM)(const char*, const char*, KK_INT*, KK_INT*, + KK_INT*, const double*, const double*, + KK_INT*, const double*, KK_INT*, + const double*, + /* */ double*, KK_INT*); +void F77_BLAS_MANGLE(cgemm, CGEMM)(const char*, const char*, KK_INT*, KK_INT*, + KK_INT*, const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); -void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, int*, int*, int*, + /* */ std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zgemm, ZGEMM)(const char*, const char*, KK_INT*, KK_INT*, + KK_INT*, const std::complex*, + const std::complex*, KK_INT*, + const std::complex*, 
KK_INT*, const std::complex*, - const std::complex*, int*, - const std::complex*, int*, - const std::complex*, - /* */ std::complex*, int*); + /* */ std::complex*, KK_INT*); /// /// Herk /// -void F77_BLAS_MANGLE(ssyrk, SSYRK)(const char*, const char*, int*, int*, - const float*, const float*, int*, +void F77_BLAS_MANGLE(ssyrk, SSYRK)(const char*, const char*, KK_INT*, KK_INT*, + const float*, const float*, KK_INT*, const float*, - /* */ float*, int*); -void F77_BLAS_MANGLE(dsyrk, DSYRK)(const char*, const char*, int*, int*, - const double*, const double*, int*, + /* */ float*, KK_INT*); +void F77_BLAS_MANGLE(dsyrk, DSYRK)(const char*, const char*, KK_INT*, KK_INT*, + const double*, const double*, KK_INT*, const double*, - /* */ double*, int*); -void F77_BLAS_MANGLE(cherk, CHERK)(const char*, const char*, int*, int*, + /* */ double*, KK_INT*); +void F77_BLAS_MANGLE(cherk, CHERK)(const char*, const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); -void F77_BLAS_MANGLE(zherk, ZHERK)(const char*, const char*, int*, int*, + /* */ std::complex*, KK_INT*); +void F77_BLAS_MANGLE(zherk, ZHERK)(const char*, const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, + const std::complex*, KK_INT*, const std::complex*, - /* */ std::complex*, int*); + /* */ std::complex*, KK_INT*); /// /// Trmm /// void F77_BLAS_MANGLE(strmm, STRMM)(const char*, const char*, const char*, - const char*, int*, int*, const float*, - const float*, int*, - /* */ float*, int*); + const char*, KK_INT*, KK_INT*, const float*, + const float*, KK_INT*, + /* */ float*, KK_INT*); void F77_BLAS_MANGLE(dtrmm, DTRMM)(const char*, const char*, const char*, - const char*, int*, int*, const double*, - const double*, int*, - /* */ double*, int*); + const char*, KK_INT*, KK_INT*, const double*, + const double*, KK_INT*, + /* */ double*, KK_INT*); void F77_BLAS_MANGLE(ctrmm, 
CTRMM)(const char*, const char*, const char*, - const char*, int*, int*, + const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - /* */ std::complex*, int*); + const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); void F77_BLAS_MANGLE(ztrmm, ZTRMM)(const char*, const char*, const char*, - const char*, int*, int*, + const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - /* */ std::complex*, int*); + const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); /// /// Trsm /// void F77_BLAS_MANGLE(strsm, STRSM)(const char*, const char*, const char*, - const char*, int*, int*, const float*, - const float*, int*, - /* */ float*, int*); + const char*, KK_INT*, KK_INT*, const float*, + const float*, KK_INT*, + /* */ float*, KK_INT*); void F77_BLAS_MANGLE(dtrsm, DTRSM)(const char*, const char*, const char*, - const char*, int*, int*, const double*, - const double*, int*, - /* */ double*, int*); + const char*, KK_INT*, KK_INT*, const double*, + const double*, KK_INT*, + /* */ double*, KK_INT*); void F77_BLAS_MANGLE(ctrsm, CTRSM)(const char*, const char*, const char*, - const char*, int*, int*, + const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - /* */ std::complex*, int*); + const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); void F77_BLAS_MANGLE(ztrsm, ZTRSM)(const char*, const char*, const char*, - const char*, int*, int*, + const char*, KK_INT*, KK_INT*, const std::complex*, - const std::complex*, int*, - /* */ std::complex*, int*); + const std::complex*, KK_INT*, + /* */ std::complex*, KK_INT*); } -void F77_BLAS_MANGLE(sscal, SSCAL)(const int* N, const float* alpha, - /* */ float* x, const int* x_inc); -void F77_BLAS_MANGLE(dscal, DSCAL)(const int* N, const double* alpha, - /* */ double* x, const int* x_inc); +void F77_BLAS_MANGLE(sscal, SSCAL)(const KK_INT* N, const float* alpha, + /* */ float* x, const KK_INT* x_inc); +void F77_BLAS_MANGLE(dscal, DSCAL)(const 
KK_INT* N, const double* alpha, + /* */ double* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(cscal, - CSCAL)(const int* N, const std::complex* alpha, - /* */ std::complex* x, const int* x_inc); + CSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); void F77_BLAS_MANGLE(zscal, - ZSCAL)(const int* N, const std::complex* alpha, - /* */ std::complex* x, const int* x_inc); + ZSCAL)(const KK_INT* N, const std::complex* alpha, + /* */ std::complex* x, const KK_INT* x_inc); #define F77_FUNC_SSCAL F77_BLAS_MANGLE(sscal, SSCAL) #define F77_FUNC_DSCAL F77_BLAS_MANGLE(dscal, DSCAL) @@ -466,6 +525,12 @@ void F77_BLAS_MANGLE(zscal, #define F77_FUNC_CHER F77_BLAS_MANGLE(cher, CHER) #define F77_FUNC_ZHER F77_BLAS_MANGLE(zher, ZHER) +#define F77_FUNC_SSYR2 F77_BLAS_MANGLE(ssyr2, SSYR2) +#define F77_FUNC_DSYR2 F77_BLAS_MANGLE(dsyr2, DSYR2) + +#define F77_FUNC_CHER2 F77_BLAS_MANGLE(cher2, CHER2) +#define F77_FUNC_ZHER2 F77_BLAS_MANGLE(zher2, ZHER2) + #define F77_FUNC_STRSV F77_BLAS_MANGLE(strsv, STRSV) #define F77_FUNC_DTRSV F77_BLAS_MANGLE(dtrsv, DTRSV) #define F77_FUNC_CTRSV F77_BLAS_MANGLE(ctrsv, CTRSV) @@ -499,35 +564,36 @@ namespace Impl { /// template <> -void HostBlas::scal(int n, const float alpha, - /* */ float* x, int x_inc) { +void HostBlas::scal(KK_INT n, const float alpha, + /* */ float* x, KK_INT x_inc) { F77_FUNC_SSCAL(&n, &alpha, x, &x_inc); } template <> -int HostBlas::iamax(int n, const float* x, int x_inc) { +KK_INT HostBlas::iamax(KK_INT n, const float* x, KK_INT x_inc) { return F77_FUNC_ISAMAX(&n, x, &x_inc); } template <> -float HostBlas::nrm2(int n, const float* x, int x_inc) { +float HostBlas::nrm2(KK_INT n, const float* x, KK_INT x_inc) { return F77_FUNC_SNRM2(&n, x, &x_inc); } template <> -float HostBlas::asum(int n, const float* x, int x_inc) { +float HostBlas::asum(KK_INT n, const float* x, KK_INT x_inc) { return F77_FUNC_SASUM(&n, x, &x_inc); } template <> -float HostBlas::dot(int n, const float* x, int x_inc, 
const float* y, - int y_inc) { +float HostBlas::dot(KK_INT n, const float* x, KK_INT x_inc, + const float* y, KK_INT y_inc) { return F77_FUNC_SDOT(&n, x, &x_inc, y, &y_inc); } template <> -void HostBlas::axpy(int n, const float alpha, const float* x, int x_inc, - /* */ float* y, int y_inc) { +void HostBlas::axpy(KK_INT n, const float alpha, const float* x, + KK_INT x_inc, + /* */ float* y, KK_INT y_inc) { F77_FUNC_SAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas::rot(int const N, float* X, int const incx, float* Y, - int const incy, float* c, float* s) { +void HostBlas::rot(KK_INT const N, float* X, KK_INT const incx, float* Y, + KK_INT const incy, float* c, float* s) { F77_FUNC_SROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -535,8 +601,8 @@ void HostBlas::rotg(float* a, float* b, float* c, float* s) { F77_FUNC_SROTG(a, b, c, s); } template <> -void HostBlas::rotm(const int n, float* X, const int incx, float* Y, - const int incy, const float* param) { +void HostBlas::rotm(const KK_INT n, float* X, const KK_INT incx, + float* Y, const KK_INT incy, const float* param) { F77_FUNC_SROTM(&n, X, &incx, Y, &incy, param); } template <> @@ -545,62 +611,69 @@ void HostBlas::rotmg(float* d1, float* d2, float* x1, const float* y1, F77_FUNC_SROTMG(d1, d2, x1, y1, param); } template <> -void HostBlas::swap(int const N, float* X, int const incx, float* Y, - int const incy) { +void HostBlas::swap(KK_INT const N, float* X, KK_INT const incx, + float* Y, KK_INT const incy) { F77_FUNC_SSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas::gemv(const char trans, int m, int n, const float alpha, - const float* a, int lda, const float* b, int ldb, - const float beta, - /* */ float* c, int ldc) { +void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, + const float alpha, const float* a, KK_INT lda, + const float* b, KK_INT ldb, const float beta, + /* */ float* c, KK_INT ldc) { F77_FUNC_SGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } 
template <> -void HostBlas::ger(int m, int n, const float alpha, const float* x, - int incx, const float* y, int incy, float* a, - int lda) { +void HostBlas::ger(KK_INT m, KK_INT n, const float alpha, const float* x, + KK_INT incx, const float* y, KK_INT incy, float* a, + KK_INT lda) { F77_FUNC_SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr(const char uplo, int n, const float alpha, - const float* x, int incx, float* a, int lda) { +void HostBlas::syr(const char uplo, KK_INT n, const float alpha, + const float* x, KK_INT incx, float* a, KK_INT lda) { F77_FUNC_SSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> +void HostBlas::syr2(const char uplo, KK_INT n, const float alpha, + const float* x, KK_INT incx, const float* y, + KK_INT incy, float* a, KK_INT lda) { + F77_FUNC_SSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, - int m, const float* a, int lda, - /* */ float* b, int ldb) { + KK_INT m, const float* a, KK_INT lda, + /* */ float* b, KK_INT ldb) { F77_FUNC_STRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> -void HostBlas::gemm(const char transa, const char transb, int m, int n, - int k, const float alpha, const float* a, int lda, - const float* b, int ldb, const float beta, - /* */ float* c, int ldc) { +void HostBlas::gemm(const char transa, const char transb, KK_INT m, + KK_INT n, KK_INT k, const float alpha, + const float* a, KK_INT lda, const float* b, + KK_INT ldb, const float beta, + /* */ float* c, KK_INT ldc) { F77_FUNC_SGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::herk(const char transa, const char transb, int n, int k, - const float alpha, const float* a, int lda, - const float beta, - /* */ float* c, int ldc) { +void HostBlas::herk(const char transa, const char transb, KK_INT n, + KK_INT k, const float alpha, const float* a, + KK_INT lda, 
const float beta, + /* */ float* c, KK_INT ldc) { F77_FUNC_SSYRK(&transa, &transb, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } template <> void HostBlas::trmm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const float alpha, - const float* a, int lda, - /* */ float* b, int ldb) { + const char diag, KK_INT m, KK_INT n, + const float alpha, const float* a, KK_INT lda, + /* */ float* b, KK_INT ldb) { F77_FUNC_STRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } template <> void HostBlas::trsm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const float alpha, - const float* a, int lda, - /* */ float* b, int ldb) { + const char diag, KK_INT m, KK_INT n, + const float alpha, const float* a, KK_INT lda, + /* */ float* b, KK_INT ldb) { F77_FUNC_STRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } @@ -610,36 +683,36 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, /// template <> -void HostBlas::scal(int n, const double alpha, - /* */ double* x, int x_inc) { +void HostBlas::scal(KK_INT n, const double alpha, + /* */ double* x, KK_INT x_inc) { F77_FUNC_DSCAL(&n, &alpha, x, &x_inc); } template <> -int HostBlas::iamax(int n, const double* x, int x_inc) { +KK_INT HostBlas::iamax(KK_INT n, const double* x, KK_INT x_inc) { return F77_FUNC_IDAMAX(&n, x, &x_inc); } template <> -double HostBlas::nrm2(int n, const double* x, int x_inc) { +double HostBlas::nrm2(KK_INT n, const double* x, KK_INT x_inc) { return F77_FUNC_DNRM2(&n, x, &x_inc); } template <> -double HostBlas::asum(int n, const double* x, int x_inc) { +double HostBlas::asum(KK_INT n, const double* x, KK_INT x_inc) { return F77_FUNC_DASUM(&n, x, &x_inc); } template <> -double HostBlas::dot(int n, const double* x, int x_inc, const double* y, - int y_inc) { +double HostBlas::dot(KK_INT n, const double* x, KK_INT x_inc, + const double* y, KK_INT y_inc) { return F77_FUNC_DDOT(&n, x, 
&x_inc, y, &y_inc); } template <> -void HostBlas::axpy(int n, const double alpha, const double* x, - int x_inc, - /* */ double* y, int y_inc) { +void HostBlas::axpy(KK_INT n, const double alpha, const double* x, + KK_INT x_inc, + /* */ double* y, KK_INT y_inc) { F77_FUNC_DAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas::rot(int const N, double* X, int const incx, double* Y, - int const incy, double* c, double* s) { +void HostBlas::rot(KK_INT const N, double* X, KK_INT const incx, + double* Y, KK_INT const incy, double* c, double* s) { F77_FUNC_DROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -647,8 +720,8 @@ void HostBlas::rotg(double* a, double* b, double* c, double* s) { F77_FUNC_DROTG(a, b, c, s); } template <> -void HostBlas::rotm(const int n, double* X, const int incx, double* Y, - const int incy, const double* param) { +void HostBlas::rotm(const KK_INT n, double* X, const KK_INT incx, + double* Y, const KK_INT incy, const double* param) { F77_FUNC_DROTM(&n, X, &incx, Y, &incy, param); } template <> @@ -657,62 +730,70 @@ void HostBlas::rotmg(double* d1, double* d2, double* x1, F77_FUNC_DROTMG(d1, d2, x1, y1, param); } template <> -void HostBlas::swap(int const N, double* X, int const incx, double* Y, - int const incy) { +void HostBlas::swap(KK_INT const N, double* X, KK_INT const incx, + double* Y, KK_INT const incy) { F77_FUNC_DSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas::gemv(const char trans, int m, int n, const double alpha, - const double* a, int lda, const double* b, int ldb, - const double beta, - /* */ double* c, int ldc) { +void HostBlas::gemv(const char trans, KK_INT m, KK_INT n, + const double alpha, const double* a, KK_INT lda, + const double* b, KK_INT ldb, const double beta, + /* */ double* c, KK_INT ldc) { F77_FUNC_DGEMV(&trans, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::ger(int m, int n, const double alpha, const double* x, - int incx, const double* y, int incy, 
double* a, - int lda) { +void HostBlas::ger(KK_INT m, KK_INT n, const double alpha, + const double* x, KK_INT incx, const double* y, + KK_INT incy, double* a, KK_INT lda) { F77_FUNC_DGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda); } template <> -void HostBlas::syr(const char uplo, int n, const double alpha, - const double* x, int incx, double* a, int lda) { +void HostBlas::syr(const char uplo, KK_INT n, const double alpha, + const double* x, KK_INT incx, double* a, + KK_INT lda) { F77_FUNC_DSYR(&uplo, &n, &alpha, x, &incx, a, &lda); } template <> +void HostBlas::syr2(const char uplo, KK_INT n, const double alpha, + const double* x, KK_INT incx, const double* y, + KK_INT incy, double* a, KK_INT lda) { + F77_FUNC_DSYR2(&uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); +} +template <> void HostBlas::trsv(const char uplo, const char transa, const char diag, - int m, const double* a, int lda, - /* */ double* b, int ldb) { + KK_INT m, const double* a, KK_INT lda, + /* */ double* b, KK_INT ldb) { F77_FUNC_DTRSV(&uplo, &transa, &diag, &m, a, &lda, b, &ldb); } template <> -void HostBlas::gemm(const char transa, const char transb, int m, int n, - int k, const double alpha, const double* a, int lda, - const double* b, int ldb, const double beta, - /* */ double* c, int ldc) { +void HostBlas::gemm(const char transa, const char transb, KK_INT m, + KK_INT n, KK_INT k, const double alpha, + const double* a, KK_INT lda, const double* b, + KK_INT ldb, const double beta, + /* */ double* c, KK_INT ldc) { F77_FUNC_DGEMM(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } template <> -void HostBlas::herk(const char transa, const char transb, int n, int k, - const double alpha, const double* a, int lda, - const double beta, - /* */ double* c, int ldc) { +void HostBlas::herk(const char transa, const char transb, KK_INT n, + KK_INT k, const double alpha, const double* a, + KK_INT lda, const double beta, + /* */ double* c, KK_INT ldc) { F77_FUNC_DSYRK(&transa, 
&transb, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } template <> void HostBlas::trmm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const double alpha, - const double* a, int lda, - /* */ double* b, int ldb) { + const char diag, KK_INT m, KK_INT n, + const double alpha, const double* a, KK_INT lda, + /* */ double* b, KK_INT ldb) { F77_FUNC_DTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } template <> void HostBlas::trsm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const double alpha, - const double* a, int lda, - /* */ double* b, int ldb) { + const char diag, KK_INT m, KK_INT n, + const double alpha, const double* a, KK_INT lda, + /* */ double* b, KK_INT ldb) { F77_FUNC_DTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } @@ -722,33 +803,37 @@ void HostBlas::trsm(const char side, const char uplo, const char transa, /// template <> -void HostBlas >::scal(int n, +void HostBlas >::scal(KK_INT n, const std::complex alpha, /* */ std::complex* x, - int x_inc) { + KK_INT x_inc) { F77_FUNC_CSCAL(&n, &alpha, x, &x_inc); } template <> -int HostBlas >::iamax(int n, const std::complex* x, - int x_inc) { +KK_INT HostBlas >::iamax(KK_INT n, + const std::complex* x, + KK_INT x_inc) { return F77_FUNC_ICAMAX(&n, x, &x_inc); } template <> -float HostBlas >::nrm2(int n, const std::complex* x, - int x_inc) { +float HostBlas >::nrm2(KK_INT n, + const std::complex* x, + KK_INT x_inc) { return F77_FUNC_SCNRM2(&n, x, &x_inc); } template <> -float HostBlas >::asum(int n, const std::complex* x, - int x_inc) { +float HostBlas >::asum(KK_INT n, + const std::complex* x, + KK_INT x_inc) { return F77_FUNC_SCASUM(&n, x, &x_inc); } template <> std::complex HostBlas >::dot( - int n, const std::complex* x, int x_inc, - const std::complex* y, int y_inc) { + KK_INT n, const std::complex* x, KK_INT x_inc, + const std::complex* y, KK_INT y_inc) { #if 
defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) - return F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); + _kk_float2 res = F77_FUNC_CDOTC(&n, x, &x_inc, y, &y_inc); + return std::complex(res.vals[0], res.vals[1]); #else std::complex res; F77_FUNC_CDOTC(&res, &n, x, &x_inc, y, &y_inc); @@ -756,18 +841,20 @@ std::complex HostBlas >::dot( #endif } template <> -void HostBlas >::axpy(int n, +void HostBlas >::axpy(KK_INT n, const std::complex alpha, const std::complex* x, - int x_inc, + KK_INT x_inc, /* */ std::complex* y, - int y_inc) { + KK_INT y_inc) { F77_FUNC_CAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas >::rot(int const N, std::complex* X, - int const incx, std::complex* Y, - int const incy, float* c, float* s) { +void HostBlas >::rot(KK_INT const N, std::complex* X, + KK_INT const incx, + std::complex* Y, + KK_INT const incy, float* c, + float* s) { F77_FUNC_CROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -777,38 +864,37 @@ void HostBlas >::rotg(std::complex* a, F77_FUNC_CROTG(a, b, c, s); } template <> -void HostBlas >::swap(int const N, std::complex* X, - int const incx, +void HostBlas >::swap(KK_INT const N, + std::complex* X, + KK_INT const incx, std::complex* Y, - int const incy) { + KK_INT const incy) { F77_FUNC_CSWAP(&N, X, &incx, Y, &incy); } template <> -void HostBlas >::gemv(const char trans, int m, int n, - const std::complex alpha, - const std::complex* a, int lda, - const std::complex* b, int ldb, - const std::complex beta, - /* */ std::complex* c, - int ldc) { +void HostBlas >::gemv( + const char trans, KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, const std::complex* b, + KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_CGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, &beta, (std::complex*)c, &ldc); } template <> void HostBlas >::geru( - int m, int n, const std::complex alpha, const std::complex* x, - int incx, 
const std::complex* y, int incy, std::complex* a, - int lda) { + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_CGERU(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> void HostBlas >::gerc( - int m, int n, const std::complex alpha, const std::complex* x, - int incx, const std::complex* y, int incy, std::complex* a, - int lda) { + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_CGERC(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); @@ -816,63 +902,67 @@ void HostBlas >::gerc( template <> template <> void HostBlas >::cher( - const char uplo, int n, const float alpha, const std::complex* x, - int incx, std::complex* a, int lda) { + const char uplo, KK_INT n, const float alpha, const std::complex* x, + KK_INT incx, std::complex* a, KK_INT lda) { F77_FUNC_CHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> +void HostBlas >::cher2( + const char uplo, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_CHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, (std::complex*)a, + &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, - const char diag, int m, - const std::complex* a, int lda, + const char diag, KK_INT m, + const std::complex* a, + KK_INT lda, /* */ std::complex* b, - int ldb) { + KK_INT ldb) { F77_FUNC_CTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> void HostBlas >::gemm( - const char transa, const char transb, int m, int n, int k, - const std::complex alpha, 
const std::complex* a, int lda, - const std::complex* b, int ldb, const std::complex beta, - /* */ std::complex* c, int ldc) { + const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex* b, KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_CGEMM(&transa, &transb, &m, &n, &k, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::herk(const char transa, const char transb, - int n, int k, - const std::complex alpha, - const std::complex* a, int lda, - const std::complex beta, - /* */ std::complex* c, - int ldc) { +void HostBlas >::herk( + const char transa, const char transb, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_CHERK(&transa, &transb, &n, &k, &alpha, (const std::complex*)a, &lda, &beta, (std::complex*)c, &ldc); } template <> -void HostBlas >::trmm(const char side, const char uplo, - const char transa, const char diag, - int m, int n, - const std::complex alpha, - const std::complex* a, int lda, - /* */ std::complex* b, - int ldb) { +void HostBlas >::trmm( + const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { F77_FUNC_CTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> -void HostBlas >::trsm(const char side, const char uplo, - const char transa, const char diag, - int m, int n, - const std::complex alpha, - const std::complex* a, int lda, - /* */ std::complex* b, - int ldb) { +void HostBlas >::trsm( + const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* 
a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { F77_FUNC_CTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, (std::complex*)b, &ldb); @@ -883,35 +973,37 @@ void HostBlas >::trsm(const char side, const char uplo, /// template <> -void HostBlas >::scal(int n, +void HostBlas >::scal(KK_INT n, const std::complex alpha, /* */ std::complex* x, - int x_inc) { + KK_INT x_inc) { F77_FUNC_ZSCAL(&n, &alpha, x, &x_inc); } template <> -int HostBlas >::iamax(int n, const std::complex* x, - int x_inc) { +KK_INT HostBlas >::iamax(KK_INT n, + const std::complex* x, + KK_INT x_inc) { return F77_FUNC_IZAMAX(&n, x, &x_inc); } template <> -double HostBlas >::nrm2(int n, +double HostBlas >::nrm2(KK_INT n, const std::complex* x, - int x_inc) { + KK_INT x_inc) { return F77_FUNC_DZNRM2(&n, x, &x_inc); } template <> -double HostBlas >::asum(int n, +double HostBlas >::asum(KK_INT n, const std::complex* x, - int x_inc) { + KK_INT x_inc) { return F77_FUNC_DZASUM(&n, x, &x_inc); } template <> std::complex HostBlas >::dot( - int n, const std::complex* x, int x_inc, - const std::complex* y, int y_inc) { + KK_INT n, const std::complex* x, KK_INT x_inc, + const std::complex* y, KK_INT y_inc) { #if defined(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) - return F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); + _kk_double2 res = F77_FUNC_ZDOTC(&n, x, &x_inc, y, &y_inc); + return std::complex(res.vals[0], res.vals[1]); #else std::complex res; F77_FUNC_ZDOTC(&res, &n, x, &x_inc, y, &y_inc); @@ -919,20 +1011,18 @@ std::complex HostBlas >::dot( #endif } template <> -void HostBlas >::axpy(int n, +void HostBlas >::axpy(KK_INT n, const std::complex alpha, const std::complex* x, - int x_inc, + KK_INT x_inc, /* */ std::complex* y, - int y_inc) { + KK_INT y_inc) { F77_FUNC_ZAXPY(&n, &alpha, x, &x_inc, y, &y_inc); } template <> -void HostBlas >::rot(int const N, std::complex* X, - int const incx, - std::complex* Y, - int const incy, double* c, - double* s) { +void HostBlas >::rot( + KK_INT 
const N, std::complex* X, KK_INT const incx, + std::complex* Y, KK_INT const incy, double* c, double* s) { F77_FUNC_ZROT(&N, X, &incx, Y, &incy, c, s); } template <> @@ -942,36 +1032,37 @@ void HostBlas >::rotg(std::complex* a, F77_FUNC_ZROTG(a, b, c, s); } template <> -void HostBlas >::swap(int const N, std::complex* X, - int const incx, +void HostBlas >::swap(KK_INT const N, + std::complex* X, + KK_INT const incx, std::complex* Y, - int const incy) { + KK_INT const incy) { F77_FUNC_ZSWAP(&N, X, &incx, Y, &incy); } template <> void HostBlas >::gemv( - const char trans, int m, int n, const std::complex alpha, - const std::complex* a, int lda, const std::complex* b, - int ldb, const std::complex beta, - /* */ std::complex* c, int ldc) { + const char trans, KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, const std::complex* b, + KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_ZGEMV(&trans, &m, &n, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, &beta, (std::complex*)c, &ldc); } template <> void HostBlas >::geru( - int m, int n, const std::complex alpha, - const std::complex* x, int incx, const std::complex* y, - int incy, std::complex* a, int lda) { + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_ZGERU(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); } template <> void HostBlas >::gerc( - int m, int n, const std::complex alpha, - const std::complex* x, int incx, const std::complex* y, - int incy, std::complex* a, int lda) { + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { F77_FUNC_ZGERC(&m, &n, &alpha, (const std::complex*)x, &incx, (const std::complex*)y, &incy, (std::complex*)a, &lda); @@ -979,28 
+1070,38 @@ void HostBlas >::gerc( template <> template <> void HostBlas >::zher( - const char uplo, int n, const double alpha, const std::complex* x, - int incx, std::complex* a, int lda) { + const char uplo, KK_INT n, const double alpha, + const std::complex* x, KK_INT incx, std::complex* a, + KK_INT lda) { F77_FUNC_ZHER(&uplo, &n, &alpha, (const std::complex*)x, &incx, (std::complex*)a, &lda); } template <> +void HostBlas >::zher2( + const char uplo, KK_INT n, const std::complex alpha, + const std::complex* x, KK_INT incx, const std::complex* y, + KK_INT incy, std::complex* a, KK_INT lda) { + F77_FUNC_ZHER2(&uplo, &n, &alpha, (const std::complex*)x, &incx, + (const std::complex*)y, &incy, + (std::complex*)a, &lda); +} +template <> void HostBlas >::trsv(const char uplo, const char transa, - const char diag, int m, + const char diag, KK_INT m, const std::complex* a, - int lda, + KK_INT lda, /* */ std::complex* b, - int ldb) { + KK_INT ldb) { F77_FUNC_ZTRSV(&uplo, &transa, &diag, &m, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> void HostBlas >::gemm( - const char transa, const char transb, int m, int n, int k, - const std::complex alpha, const std::complex* a, int lda, - const std::complex* b, int ldb, const std::complex beta, - /* */ std::complex* c, int ldc) { + const char transa, const char transb, KK_INT m, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, + const std::complex* b, KK_INT ldb, const std::complex beta, + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_ZGEMM(&transa, &transb, &m, &n, &k, &alpha, (const std::complex*)a, &lda, (const std::complex*)b, &ldb, &beta, @@ -1008,30 +1109,30 @@ void HostBlas >::gemm( } template <> void HostBlas >::herk( - const char transa, const char transb, int n, int k, - const std::complex alpha, const std::complex* a, int lda, + const char transa, const char transb, KK_INT n, KK_INT k, + const std::complex alpha, const std::complex* a, KK_INT lda, const 
std::complex beta, - /* */ std::complex* c, int ldc) { + /* */ std::complex* c, KK_INT ldc) { F77_FUNC_ZHERK(&transa, &transb, &n, &k, &alpha, (const std::complex*)a, &lda, &beta, (std::complex*)c, &ldc); } template <> void HostBlas >::trmm( - const char side, const char uplo, const char transa, const char diag, int m, - int n, const std::complex alpha, const std::complex* a, - int lda, - /* */ std::complex* b, int ldb) { + const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { F77_FUNC_ZTRMM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, (std::complex*)b, &ldb); } template <> void HostBlas >::trsm( - const char side, const char uplo, const char transa, const char diag, int m, - int n, const std::complex alpha, const std::complex* a, - int lda, - /* */ std::complex* b, int ldb) { + const char side, const char uplo, const char transa, const char diag, + KK_INT m, KK_INT n, const std::complex alpha, + const std::complex* a, KK_INT lda, + /* */ std::complex* b, KK_INT ldb) { F77_FUNC_ZTRSM(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const std::complex*)a, &lda, (std::complex*)b, &ldb); diff --git a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.hpp b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.hpp index 06a562015551..5fb7c1f624ff 100644 --- a/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.hpp +++ b/packages/kokkos-kernels/blas/tpls/KokkosBlas_Host_tpl.hpp @@ -25,87 +25,106 @@ #include "Kokkos_ArithTraits.hpp" #if defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#include "mkl_types.h" +#endif namespace KokkosBlas { namespace Impl { +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +using KK_INT = MKL_INT; +#else +using KK_INT = int; +#endif + template struct HostBlas { typedef Kokkos::ArithTraits ats; typedef typename ats::mag_type mag_type; - static 
void scal(int n, const T alpha, - /* */ T *x, int x_inc); + static void scal(KK_INT n, const T alpha, + /* */ T *x, KK_INT x_inc); - static int iamax(int n, const T *x, int x_inc); + static KK_INT iamax(KK_INT n, const T *x, KK_INT x_inc); - static mag_type nrm2(int n, const T *x, int x_inc); + static mag_type nrm2(KK_INT n, const T *x, KK_INT x_inc); - static mag_type asum(int n, const T *x, int x_inc); + static mag_type asum(KK_INT n, const T *x, KK_INT x_inc); - static T dot(int n, const T *x, int x_inc, const T *y, int y_inc); + static T dot(KK_INT n, const T *x, KK_INT x_inc, const T *y, KK_INT y_inc); - static void axpy(int n, const T alpha, const T *x, int x_inc, - /* */ T *y, int y_inc); + static void axpy(KK_INT n, const T alpha, const T *x, KK_INT x_inc, + /* */ T *y, KK_INT y_inc); - static void rot(int const N, T *X, int const incx, T *Y, int const incy, - mag_type *c, mag_type *s); + static void rot(KK_INT const N, T *X, KK_INT const incx, T *Y, + KK_INT const incy, mag_type *c, mag_type *s); static void rotg(T *a, T *b, mag_type *c, T *s); - static void rotm(const int n, T *X, const int incx, T *Y, const int incy, - T const *param); + static void rotm(const KK_INT n, T *X, const KK_INT incx, T *Y, + const KK_INT incy, T const *param); static void rotmg(T *d1, T *d2, T *x1, const T *y1, T *param); - static void swap(int const N, T *X, int const incx, T *Y, int const incy); + static void swap(KK_INT const N, T *X, KK_INT const incx, T *Y, + KK_INT const incy); + + static void gemv(const char trans, KK_INT m, KK_INT n, const T alpha, + const T *a, KK_INT lda, const T *b, KK_INT ldb, const T beta, + /* */ T *c, KK_INT ldc); - static void gemv(const char trans, int m, int n, const T alpha, const T *a, - int lda, const T *b, int ldb, const T beta, - /* */ T *c, int ldc); + static void ger(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, + const T *y, KK_INT incy, T *a, KK_INT lda); - static void ger(int m, int n, const T alpha, const T *x, int 
incx, const T *y, - int incy, T *a, int lda); + static void geru(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, + const T *y, KK_INT incy, T *a, KK_INT lda); - static void geru(int m, int n, const T alpha, const T *x, int incx, - const T *y, int incy, T *a, int lda); + static void gerc(KK_INT m, KK_INT n, const T alpha, const T *x, KK_INT incx, + const T *y, KK_INT incy, T *a, KK_INT lda); - static void gerc(int m, int n, const T alpha, const T *x, int incx, - const T *y, int incy, T *a, int lda); + static void syr(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, T *a, KK_INT lda); - static void syr(const char uplo, int n, const T alpha, const T *x, int incx, - T *a, int lda); + static void syr2(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); template - static void cher(const char uplo, int n, const tAlpha alpha, const T *x, - int incx, T *a, int lda); + static void cher(const char uplo, KK_INT n, const tAlpha alpha, const T *x, + KK_INT incx, T *a, KK_INT lda); template - static void zher(const char uplo, int n, const tAlpha alpha, const T *x, - int incx, T *a, int lda); + static void zher(const char uplo, KK_INT n, const tAlpha alpha, const T *x, + KK_INT incx, T *a, KK_INT lda); + + static void cher2(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); + + static void zher2(const char uplo, KK_INT n, const T alpha, const T *x, + KK_INT incx, const T *y, KK_INT incy, T *a, KK_INT lda); - static void trsv(const char uplo, const char transa, const char diag, int m, - const T *a, int lda, - /* */ T *b, int ldb); + static void trsv(const char uplo, const char transa, const char diag, + KK_INT m, const T *a, KK_INT lda, + /* */ T *b, KK_INT ldb); - static void gemm(const char transa, const char transb, int m, int n, int k, - const T alpha, const T *a, int lda, const T *b, int ldb, - const T beta, - /* */ T *c, 
int ldc); + static void gemm(const char transa, const char transb, KK_INT m, KK_INT n, + KK_INT k, const T alpha, const T *a, KK_INT lda, const T *b, + KK_INT ldb, const T beta, + /* */ T *c, KK_INT ldc); - static void herk(const char transa, const char transb, int n, int k, - const T alpha, const T *a, int lda, const T beta, - /* */ T *c, int ldc); + static void herk(const char transa, const char transb, KK_INT n, KK_INT k, + const T alpha, const T *a, KK_INT lda, const T beta, + /* */ T *c, KK_INT ldc); static void trmm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const T alpha, const T *a, - int lda, - /* */ T *b, int ldb); + const char diag, KK_INT m, KK_INT n, const T alpha, + const T *a, KK_INT lda, + /* */ T *b, KK_INT ldb); static void trsm(const char side, const char uplo, const char transa, - const char diag, int m, int n, const T alpha, const T *a, - int lda, - /* */ T *b, int ldb); + const char diag, KK_INT m, KK_INT n, const T alpha, + const T *a, KK_INT lda, + /* */ T *b, KK_INT ldb); }; } // namespace Impl } // namespace KokkosBlas diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas.hpp index a29c5ffd727d..9bb37d8d95a3 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas.hpp @@ -21,6 +21,7 @@ #include "Test_Blas1_asum.hpp" #include "Test_Blas1_axpby.hpp" #include "Test_Blas1_axpy.hpp" +#include "Test_Blas1_axpby_unification.hpp" #include "Test_Blas1_dot.hpp" #include "Test_Blas1_iamax.hpp" #include "Test_Blas1_mult.hpp" @@ -60,6 +61,7 @@ #include "Test_Blas2_gemv.hpp" #include "Test_Blas2_ger.hpp" #include "Test_Blas2_syr.hpp" +#include "Test_Blas2_syr2.hpp" // Serial Blas 2 #include "Test_Blas2_serial_gemv.hpp" diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas1_axpby.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas1_axpby.hpp index 8d5afb5f0b9a..299e18e493b0 
100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas1_axpby.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas1_axpby.hpp @@ -109,8 +109,6 @@ void impl_test_axpby_mv(int N, int K) { Kokkos::deep_copy(org_y.h_base, y.d_base); Kokkos::deep_copy(x.h_base, x.d_base); - Kokkos::View r("Dot::Result", K); - KokkosBlas::axpby(a, x.d_view, b, y.d_view); Kokkos::deep_copy(y.h_base, y.d_base); diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas1_axpby_unification.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas1_axpby_unification.hpp new file mode 100644 index 000000000000..6ce7bad0b148 --- /dev/null +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas1_axpby_unification.hpp @@ -0,0 +1,2741 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +// ********************************************************************** +// The tests executed by the code below cover many combinations for +// the operation y += a * x + b * y: +// 01) Type of 'x' and 'a' components: float, double, complex, ... +// 02) Type of 'y' and 'b' components: float, double, complex, ... +// 03) Execution space: serial, threads, OpenMP, Cuda, ... 
+// 04) Layout of 'x' and 'a' +// 05) Layout of 'y' and 'b' +// 06) Ranks of 'x' and 'y': rank-1 or rank-2 +// 07) Ranks of 'a' and 'b': scalars or rank-0 or rank-1 +// +// Choices (01)-(03) are selected in the routines TEST_F() at the very +// bottom of the file, when calling: +// - either test_axpby_unification<...>(), +// - or test_axpby_mv_unification<...>(). +// +// Choices (04)-(05) are selected in routines: +// - test_axpby_unification<...>(), when calling +// Test::impl_test_axpby_unification<...>(), and +// - test_axpby_mv_unification<...>(), when calling +// Test::impl_test_axpby_mv_unification<...>(). +// +// Choices (06)-(07) are selected in routines: +// - Test::impl_test_axpby_unification<...>(), through +// 16 different combinations and calls to +// Test::impl_test_axpby_unification_compare<...>(), and +// - Test::impl_test_axpby_mv_unification<...>(), through +// 36 different combinations and calls to +// Test::impl_test_axpby_mv_unification_compare<...>(). +// +// The constexpr integer value 15 for 'numVecsAxpbyTest' was chosen to +// force the test of the three unrolling values 8, 4, and 1, in routine +// Axpby_MV_Invoke_Left<...>(...) 
in file KokkosBlas1_axpby_mv_impl.hpp +// ********************************************************************** + +#include +#include +#include +#include + +static constexpr int numVecsAxpbyTest = 15; + +namespace Test { + +template +struct getScalarTypeFromT { + using type = T; +}; + +template +struct getScalarTypeFromT { + using type = typename T::value_type; +}; + +template +constexpr bool isRank0() { + if constexpr (Kokkos::is_view_v) { + return (T::rank == 0); + } + return false; +} + +template +void impl_test_axpby_unification_compare( + tA const& a, tX const& x, tB const& b, tY const& y, int N, + bool testWithNanY, + typename Kokkos::ArithTraits::mag_type const max_val, + typename Kokkos::ArithTraits::mag_type const max_error, + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), + tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { + using ScalarTypeX = + typename std::remove_const::type; + using ScalarTypeY = + typename std::remove_const::type; + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarTypeX randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); + } + Kokkos::deep_copy(x.h_base, x.d_base); + + { + ScalarTypeY randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + if (testWithNanY) { + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); + } else { + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + } + } + tY org_y("Org_Y", N); + Kokkos::deep_copy(org_y.h_base, y.d_base); + + tScalarA valueA(Kokkos::ArithTraits::zero()); + tScalarB valueB(Kokkos::ArithTraits::zero()); + + if constexpr (std::is_same_v) { + valueA = a; + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = 
h_b(); + } + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else { + Kokkos::deep_copy(b.h_base, b.d_base); + valueB = b.h_view(0); + KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); + } + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueA = inputValueA; + } else { + typename tA::HostMirror h_a("h_A"); + Kokkos::deep_copy(h_a, a); + valueA = h_a(); + } + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else { + Kokkos::deep_copy(b.h_base, b.d_base); + valueB = b.h_view(0); + KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); + } + } else { + Kokkos::deep_copy(a.h_base, a.d_base); + valueA = a.h_view(0); + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); + } else { + Kokkos::deep_copy(b.h_base, b.d_base); + valueB = b.h_view(0); + KokkosBlas::axpby(a.d_view, x.d_view, b.d_view, y.d_view); + } + } + + Kokkos::deep_copy(y.h_base, y.d_base); + + if (testWithNanY == false) { + for (int i(0); i < N; ++i) { + EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i) + + valueB * org_y.h_view(i)), + y.h_view(i), 4. 
* max_error); + } + } else { + // ******************************************************** + // Tests with 'Y == nan()' are called only for cases where + // b == Kokkos::ArithTraits::zero() + // ******************************************************** + for (int i(0); i < N; ++i) { +#if 0 + ScalarTypeY tmp = static_cast(valueA * x.h_view(i) + valueB * org_y.h_view(i)); + std::cout << "i = " << i + << ", valueA = " << valueA + << ", x.h_view(i) = " << x.h_view(i) + << ", valueB = " << valueB + << ", org_y.h_view(i) = " << org_y.h_view(i) + << ", tmp = " << tmp + << ", y.h_view(i) = " << y.h_view(i) + << std::endl; +#endif + if constexpr (std::is_same_v) { + // **************************************************************** + // 'nan()' converts to '-1' in case of 'int' => no need to compare + // **************************************************************** + if (y.h_view(i) != -1) { + EXPECT_NE(y.h_view(i), Kokkos::ArithTraits::nan()); + } + } else { + EXPECT_NE(y.h_view(i), Kokkos::ArithTraits::nan()); + } + EXPECT_NEAR_KK(static_cast(valueA * x.h_view(i)), + y.h_view(i), 4. 
* max_error); + } + } +} + +template +void impl_test_axpby_mv_unification_compare( + tA const& a, tX const& x, tB const& b, tY const& y, int N, int K, + bool testWithNanY, + typename Kokkos::ArithTraits::mag_type const max_val, + typename Kokkos::ArithTraits::mag_type const max_error, + tScalarA const inputValueA = Kokkos::ArithTraits::zero(), + tScalarB const inputValueB = Kokkos::ArithTraits::zero()) { + using ScalarTypeX = + typename std::remove_const::type; + using ScalarTypeY = + typename std::remove_const::type; + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarTypeX randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); + } + Kokkos::deep_copy(x.h_base, x.d_base); + + { + ScalarTypeY randStart, randEnd; + Test::getRandomBounds(max_val, randStart, randEnd); + if (testWithNanY) { + Kokkos::deep_copy(y.d_view, Kokkos::ArithTraits::nan()); + } else { + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + } + } + tY org_y("Org_Y", N, K); + Kokkos::deep_copy(org_y.h_base, y.d_base); + + // Cannot use "if constexpr (isRank1()) {" because rank-1 variables + // are passed to current routine with view_stride_adapter<...> + bool constexpr aIsRank1 = !std::is_same_v && !isRank0(); + if constexpr (aIsRank1) { + Kokkos::deep_copy(a.h_base, a.d_base); + } + + // Cannot use "if constexpr (isRank1()) {" because rank-1 variables + // are passed to current routine with view_stride_adapter<...> + bool constexpr bIsRank1 = !std::is_same_v && !isRank0(); + if constexpr (bIsRank1) { + Kokkos::deep_copy(b.h_base, b.d_base); + } + + tScalarA valueA(Kokkos::ArithTraits::zero()); + tScalarB valueB(Kokkos::ArithTraits::zero()); + if constexpr (std::is_same_v) { + valueA = a; + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + 
typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else { + valueB = b.h_view(0); + KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); + } + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueA = inputValueA; + } else { + typename tA::HostMirror h_a("h_A"); + Kokkos::deep_copy(h_a, a); + valueA = h_a(); + } + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a, x.d_view, b, y.d_view); + } else { + valueB = b.h_view(0); + KokkosBlas::axpby(a, x.d_view, b.d_view, y.d_view); + } + } else { + valueA = a.h_view(0); + if constexpr (std::is_same_v) { + valueB = b; + KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); + } else if constexpr (isRank0()) { + if constexpr (std::is_same_v) { + valueB = inputValueB; + } else { + typename tB::HostMirror h_b("h_B"); + Kokkos::deep_copy(h_b, b); + valueB = h_b(); + } + KokkosBlas::axpby(a.d_view, x.d_view, b, y.d_view); + } else { + valueB = b.h_view(0); + KokkosBlas::axpby(a.d_view, x.d_view, b.d_view, y.d_view); + } + } + + Kokkos::deep_copy(y.h_base, y.d_base); + + if (testWithNanY == false) { + for (int i(0); i < N; ++i) { + for (int k(0); k < K; ++k) { + ScalarTypeY vanillaValue(Kokkos::ArithTraits::zero()); + if constexpr (aIsRank1) { + (void)valueA; // Avoid "set but not used" error + if constexpr (bIsRank1) { + (void)valueB; // Avoid "set but not used" error + int a_k(a.h_view.extent(0) == 1 ? 0 : k); + int b_k(b.h_view.extent(0) == 1 ? 
0 : k); +#if 0 + std::cout << "In impl_test_axpby_mv_unification_compare()" + << ": i = " << i + << ", k = " << k + << ", a.h_view.extent(0) = " << a.h_view.extent(0) + << ", a_k = " << a_k + << ", b.h_view.extent(0) = " << b.h_view.extent(0) + << ", b_k = " << b_k + << ", a.h_view(a_k) = " << a.h_view(a_k) + << ", x.h_view(i, k) = " << x.h_view(i, k) + << ", b.h_view(b_k) = " << b.h_view(b_k) + << ", org_y.h_view(i, k) = " << org_y.h_view(i, k) + << std::endl; +#endif + vanillaValue = + static_cast(a.h_view(a_k) * x.h_view(i, k) + + b.h_view(b_k) * org_y.h_view(i, k)); + } else { + int a_k(a.h_view.extent(0) == 1 ? 0 : k); + vanillaValue = static_cast( + a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + } + } else { + if constexpr (bIsRank1) { + (void)valueB; // Avoid "set but not used" error + int b_k(b.h_view.extent(0) == 1 ? 0 : k); + vanillaValue = static_cast( + valueA * x.h_view(i, k) + b.h_view(b_k) * org_y.h_view(i, k)); + } else { + vanillaValue = static_cast( + valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + } + } +#if 0 + std::cout << "In impl_test_axpby_mv_unification_compare(1)" + << ": i = " << i + << ", k = " << k + << ", y.h_view(i, k) = " << y.h_view(i, k) + << ", vanillaValue = " << vanillaValue + << std::endl; +#endif + EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 4. * max_error); + } + } + } else { + // ******************************************************** + // Tests with 'Y == nan()' are called only for cases where + // b == Kokkos::ArithTraits::zero() + // ******************************************************** + for (int i(0); i < N; ++i) { + for (int k(0); k < K; ++k) { + ScalarTypeY vanillaValue(Kokkos::ArithTraits::zero()); + if constexpr (aIsRank1) { + (void)valueA; // Avoid "set but not used" error + int a_k(a.h_view.extent(0) == 1 ? 
0 : k); + vanillaValue = + static_cast(a.h_view(a_k) * x.h_view(i, k)); +#if 0 + ScalarTypeY tmp = static_cast(a.h_view(a_k) * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + std::cout << "i = " << i + << ", k = " << k + << ", a_k = " << a_k + << ", a.h_view(a_k) = " << a.h_view(a_k) + << ", x.h_view(i, k) = " << x.h_view(i, k) + << ", valueB = " << valueB + << ", org_y.h_view(i, k) = " << org_y.h_view(i, k) + << ", tmp = " << tmp + << ", vanillaValue = " << vanillaValue + << ", y.h_view(i, k) = " << y.h_view(i, k) + << std::endl; +#endif + } else { + vanillaValue = static_cast(valueA * x.h_view(i, k)); +#if 0 + ScalarTypeY tmp = static_cast(valueA * x.h_view(i, k) + valueB * org_y.h_view(i, k)); + std::cout << "i = " << i + << ", k = " << k + << ", valueA = " << valueA + << ", x.h_view(i, k) = " << x.h_view(i, k) + << ", valueB = " << valueB + << ", org_y.h_view(i, k) = " << org_y.h_view(i, k) + << ", tmp = " << tmp + << ", vanillaValue = " << vanillaValue + << ", y.h_view(i, k) = " << y.h_view(i, k) + << std::endl; +#endif + } + + if constexpr (std::is_same_v) { + // **************************************************************** + // 'nan()' converts to '-1' in case of 'int' => no need to compare + // **************************************************************** + if (y.h_view(i, k) != -1) { + EXPECT_NE(y.h_view(i, k), Kokkos::ArithTraits::nan()); + } + } else { + EXPECT_NE(y.h_view(i, k), Kokkos::ArithTraits::nan()); + } +#if 0 + std::cout << "In impl_test_axpby_mv_unification_compare(2)" + << ": i = " << i + << ", k = " << k + << ", y.h_view(i, k) = " << y.h_view(i, k) + << ", vanillaValue = " << vanillaValue + << std::endl; +#endif + EXPECT_NEAR_KK(vanillaValue, y.h_view(i, k), 4. 
* max_error); + } + } + } +} + +template +void impl_test_axpby_unification(int const N) { + using ViewTypeAr0 = Kokkos::View; + using ViewTypeAr1s_1 = Kokkos::View; + using ViewTypeAr1d = Kokkos::View; + + using ViewTypeX = Kokkos::View; + + using ViewTypeBr0 = Kokkos::View; + using ViewTypeBr1s_1 = Kokkos::View; + using ViewTypeBr1d = Kokkos::View; + + using ViewTypeY = Kokkos::View; + + std::array const valuesA{ + -1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{ + -1, Kokkos::ArithTraits::zero(), 1, 5}; + + // eps should probably be based on tScalarB since that is the type + // in which the result is computed. + using MagnitudeB = typename Kokkos::ArithTraits::mag_type; + MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); + MagnitudeB const max_val = 10; + MagnitudeB const max_error = + static_cast( + Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + + Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * + max_val * eps; + + // ************************************************************ + // Case 01/16: Ascalar + Bscalar + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 01/16" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N); + tScalarB b; + view_stride_adapter y("Y", N); + + a = valueA; + b = valueB; + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, true, max_val, max_error); + } + } + } + } + + // 
************************************************************ + // Case 02/16: Ascalar + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 02/16" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + // ViewTypeBr0 b; + // Kokkos::deep_copy(b, valueB); + // //std::cout << "b() = " << b() << std::endl; + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N); + + a = valueA; + Kokkos::deep_copy(b, valueB); + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, true, max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 03/16: Ascalar + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 03/16" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + a = valueA; + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); + if 
(valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 04/16: Ascalar + Br1d + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 04/16" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + a = valueA; + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 05/16: Ar0 + Bscalar + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 05/16" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N); + tScalarB b; + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a, valueA); + b = valueB; + impl_test_axpby_unification_compare< + 
tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, true, max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 06/16: Ar0 + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 06/16" << std::endl; +#endif + if constexpr ((std::is_same_v) || + (std::is_same_v)) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, true, max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 07/16: Ar0 + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 07/16" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < 
valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 08/16: Ar0 + Br1d + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 08/16" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 09/16: Ar1s_1 + Bscalar + // 
************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 09/16" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + tScalarB b; + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + b = valueB; + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 10/16: Ar1s_1 + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 10/16" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, 
ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 11/16: Ar1s_1 + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 11/16" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 12/16: Ar1s_1 + Br1d + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 12/16" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, 
view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 13/16: Ar1d + Bscalar + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 13/16" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + tScalarB b; + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + b = valueB; + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 14/16: Ar1d + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 14/16" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + ViewTypeBr0 b("B"); + 
view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 15/16: Ar1d + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 15/16" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 16/16: Ar1d + Br1d + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 16/16" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const 
valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, true, max_val, max_error); + } + } + } + } +} + +template +void impl_test_axpby_mv_unification(int const N, int const K) { + // std::cout << "=========================================" << std::endl; + // std::cout << "Entering impl_test_axpby_mv_unification()" + // << ": tLayoutA = " << typeid(tLayoutA).name() + // << ": tLayoutX = " << typeid(tLayoutX).name() + // << ", tLayoutB = " << typeid(tLayoutB).name() + // << ": tLayoutY = " << typeid(tLayoutY).name() + // << std::endl; + using ViewTypeAr0 = Kokkos::View; + using ViewTypeAr1s_1 = Kokkos::View; + using ViewTypeAr1s_k = Kokkos::View; // Yes, hard coded + using ViewTypeAr1d = Kokkos::View; + + using ViewTypeX = Kokkos::View; + + using ViewTypeBr0 = Kokkos::View; + using ViewTypeBr1s_1 = Kokkos::View; + using ViewTypeBr1s_k = Kokkos::View; // Yes, hard coded + using ViewTypeBr1d = Kokkos::View; + + using ViewTypeY = Kokkos::View; + + std::array const valuesA{ + -1, Kokkos::ArithTraits::zero(), 1, 3}; + std::array const valuesB{ + -1, Kokkos::ArithTraits::zero(), 1, 5}; + + // eps should probably be based on tScalarB since that is the type + // in which the result is computed. 
+ using MagnitudeB = typename Kokkos::ArithTraits::mag_type; + MagnitudeB const eps = Kokkos::ArithTraits::epsilon(); + MagnitudeB const max_val = 10; + MagnitudeB const max_error = + static_cast( + Kokkos::ArithTraits::abs(valuesA[valuesA.size() - 1]) + + Kokkos::ArithTraits::abs(valuesB[valuesB.size() - 1])) * + max_val * eps; + + // ************************************************************ + // Case 01/36: Ascalar + Bscalar + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 01/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + a = valueA; + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, K, true, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 02/36: Ascalar + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 02/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + a = valueA; + Kokkos::deep_copy(b, 
valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, K, true, max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 03/36: Ascalar + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 03/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + a = valueA; + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 04/36: Ascalar + Br1s_k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 04/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + tScalarA a; + 
view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + a = valueA; + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 05/36: Ascalar + Br1d,1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 05/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + a = valueA; + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 06/36: Ascalar + Br1d,k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 06/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t 
j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + tScalarA a; + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + a = valueA; + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, tScalarA, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 07/36: Ar0 + Bscalar + // ************************************************************w +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 07/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + tScalarB, view_stride_adapter, Device>( + a, x, b, y, N, K, true, max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 08/36: Ar0 + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout 
<< "Starting case 08/36" << std::endl; +#endif + if constexpr ((std::is_same_v) || + (std::is_same_v)) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + ViewTypeBr0, view_stride_adapter, Device>( + a, x, b, y, N, K, true, max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 09/36: Ar0 + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 09/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, 
view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 10/36: Ar0 + Br1s_k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 10/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 11/36: Ar0 + Br1d,1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 11/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + 
Kokkos::deep_copy(a, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 12/36: Ar0 + Br1d,k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 12/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + ViewTypeAr0 a("A"); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, ViewTypeAr0, view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 13/36: Ar1s_1 + Bscalar + // ************************************************************w +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 13/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + 
tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 14/36: Ar1s_1 + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 14/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + } + + // 
************************************************************ + // Case 15/36: Ar1s_1 + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 15/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 16/36: Ar1s_1 + Br1s_k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 16/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + 
Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 17/36: Ar1s_1 + Br1d,1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 17/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 18/36: Ar1s_1 + Br1d,k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 18/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); 
+ if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 19/36: Ar1s_k + Bscalar + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 19/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 20/36: Ar1s_k + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << 
"Starting case 20/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 21/36: Ar1s_k + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 21/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + 
a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 22/36: Ar1s_k + Br1s_k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 22/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // ************************************************************ + 
// Case 23/36: Ar1s_k + Br1d,1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 23/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 24/36: Ar1s_k + Br1d,k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 24/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + 
Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 25/36: Ar1d,1 + Bscalar + // ************************************************************w +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 25/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 26/36: Ar1d,1 + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 26/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // 
Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 27/36: Ar1d,1 + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 27/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, 
max_error); + } + } + } + } + + // ************************************************************ + // Case 28/36: Ar1d,1 + Br1s_k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 28/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 29/36: Ar1d,1 + Br1d,1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 29/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); 
+ if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 30/36: Ar1d,1 + Br1d,k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 30/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", 1); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + Kokkos::deep_copy(a.d_base, valueA); + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 31/36: Ar1d,k + Bscalar + // ************************************************************w +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 31/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + tScalarB b; + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA 
+ k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + b = valueB; + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, tScalarB, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 32/36: Ar1d,k + Br0 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 32/36" << std::endl; +#endif + if constexpr (std::is_same_v) { + // Avoid the test, due to compilation errors + } else { + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + ViewTypeBr0 b("B"); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, false, + max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, ViewTypeBr0, + view_stride_adapter, Device>(a, x, b, y, N, K, true, 
+ max_val, max_error); + } + } + } + } + } + + // ************************************************************ + // Case 33/36: Ar1d,k + Br1s_1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 33/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, + view_stride_adapter, Device>(a, x, b, y, N, K, true, + max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 34/36: Ar1d,k + Br1s_k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 34/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + if (K == numVecsAxpbyTest) { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + if 
constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // ************************************************************ + // Case 35/36: Ar1d,k + Br1d,1 + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 35/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", 1); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + Kokkos::deep_copy(b.d_base, valueB); + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + if (valueB == Kokkos::ArithTraits::zero()) { + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, 
view_stride_adapter, + Device>(a, x, b, y, N, K, true, max_val, max_error); + } + } + } + } + + // ************************************************************ + // Case 36/36: Ar1d,k + Br1d,k + // ************************************************************ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Starting case 36/36" << std::endl; +#endif + for (size_t i(0); i < valuesA.size(); ++i) { + tScalarA const valueA(valuesA[i]); + for (size_t j(0); j < valuesB.size(); ++j) { + tScalarB const valueB(valuesB[j]); + { + view_stride_adapter a("A", K); + view_stride_adapter x("X", N, K); + view_stride_adapter b("B", K); + view_stride_adapter y("Y", N, K); + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + a.h_view[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } else { + for (int k(0); k < K; ++k) { + a.h_base[k] = valueA + k; + } + Kokkos::deep_copy(a.d_base, a.h_base); + } + + if constexpr (std::is_same_v) { + for (int k(0); k < K; ++k) { + b.h_view[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } else { + for (int k(0); k < K; ++k) { + b.h_base[k] = valueB + k; + } + Kokkos::deep_copy(b.d_base, b.h_base); + } + + impl_test_axpby_mv_unification_compare< + tScalarA, view_stride_adapter, + view_stride_adapter, tScalarB, + view_stride_adapter, view_stride_adapter, + Device>(a, x, b, y, N, K, false, max_val, max_error); + } + } + } + + // std::cout << "Leaving impl_test_axpby_mv_unification()" << std::endl; + // std::cout << "=========================================" << std::endl; +} + +} // namespace Test + +template +int test_axpby_unification() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Calling impl_test_axpby_unif(), L-LLL" << std::endl; +#endif + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, + 
Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Calling impl_test_axpby_unif(), L-RRR" << std::endl; +#endif + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, + Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>(14); +#endif + +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Calling impl_test_axpby_unif(), L-SSS" << std::endl; +#endif + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Calling impl_test_axpby_unif(), L-SLL" << std::endl; +#endif + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>(14); + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Calling impl_test_axpby_unif(), L-LSS" << std::endl; +#endif + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>(14); + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Calling impl_test_axpby_unif(), L-SRS" << std::endl; +#endif + Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>(14); + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Calling impl_test_axpby_unif(), L-LSR" << std::endl; +#endif + 
Test::impl_test_axpby_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>(14); +#endif + return 1; +} + +template +int test_axpby_mv_unification() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>( + 14, numVecsAxpbyTest); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutRight, tScalarX, Kokkos::LayoutRight, tScalarB, + Kokkos::LayoutRight, tScalarY, Kokkos::LayoutRight, Device>( + 14, numVecsAxpbyTest); +#endif + +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>( + 14, numVecsAxpbyTest); +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutLeft, tScalarY, Kokkos::LayoutLeft, Device>( + 14, numVecsAxpbyTest); + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutStride, Device>( + 14, numVecsAxpbyTest); + + Test::impl_test_axpby_mv_unification< + tScalarA, Kokkos::LayoutLeft, tScalarX, Kokkos::LayoutStride, tScalarB, + Kokkos::LayoutRight, tScalarY, Kokkos::LayoutStride, Device>( + 14, numVecsAxpbyTest); + + Test::impl_test_axpby_mv_unification< + 
tScalarA, Kokkos::LayoutStride, tScalarX, Kokkos::LayoutLeft, tScalarB, + Kokkos::LayoutStride, tScalarY, Kokkos::LayoutRight, Device>( + 14, numVecsAxpbyTest); +#endif + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, axpby_unification_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_float"); + test_axpby_unification(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, axpby_mv_unification_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_float"); + test_axpby_mv_unification(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, axpby_unification_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_double"); + test_axpby_unification(); +} +TEST_F(TestCategory, axpby_mv_unification_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_mv_unification_double"); + test_axpby_mv_unification(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, axpby_unification_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_unification_complex_double"); + test_axpby_unification, Kokkos::complex, + Kokkos::complex, Kokkos::complex, + TestDevice>(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, axpby_mv_unification_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_mv_unification_complex_double"); + test_axpby_mv_unification, Kokkos::complex, + Kokkos::complex, Kokkos::complex, + TestDevice>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, axpby_unification_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_unification_int"); + test_axpby_unification(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, axpby_mv_unification_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::axpby_mv_unification_int"); + test_axpby_mv_unification(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, axpby_unification_double_int) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_unification_double_int"); + test_axpby_unification(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, axpby_double_mv_unification_int) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::axpby_mv_unification_double_int"); + test_axpby_mv_unification(); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas1_nrm1.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas1_nrm1.hpp index f6938c514771..24795878d143 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas1_nrm1.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas1_nrm1.hpp @@ -22,10 +22,10 @@ namespace Test { template void impl_test_nrm1(int N) { - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits AT; - typedef typename AT::mag_type mag_type; - typedef Kokkos::ArithTraits MAT; + using ScalarA = typename ViewTypeA::value_type; + using AT = Kokkos::ArithTraits; + using mag_type = typename AT::mag_type; + using MAT = Kokkos::ArithTraits; view_stride_adapter a("a", N); diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas1_swap.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas1_swap.hpp index 382c35947b89..624552f1dca6 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas1_swap.hpp +++ 
b/packages/kokkos-kernels/blas/unit_test/Test_Blas1_swap.hpp @@ -3,11 +3,12 @@ namespace Test { namespace Impl { -template +template void test_swap(int const vector_length) { - using vector_type = VectorType; - using execution_space = typename vector_type::execution_space; - using scalar_type = typename VectorType::non_const_value_type; + using execution_space = typename DeviceType::execution_space; + using memory_space = typename DeviceType::memory_space; + using vector_type = Kokkos::View; + using scalar_type = typename vector_type::non_const_value_type; using mag_type = typename Kokkos::ArithTraits::mag_type; // Note that Xref and Yref need to always be copies of X and Y @@ -43,14 +44,12 @@ void test_swap(int const vector_length) { } // namespace Impl } // namespace Test -template +template int test_swap() { - using Vector = Kokkos::View; - - Test::Impl::test_swap(0); - Test::Impl::test_swap(10); - Test::Impl::test_swap(256); - Test::Impl::test_swap(1024); + Test::Impl::test_swap(0); + Test::Impl::test_swap(10); + Test::Impl::test_swap(256); + Test::Impl::test_swap(1024); return 0; } diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_ger.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_ger.hpp index a0860bae047b..df3d2cb5d158 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_ger.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_ger.hpp @@ -79,10 +79,11 @@ class GerTester { using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected, _ViewTypeX& x, - _ViewTypeY& y, _ViewTypeA& A, + void populateVariables(ScalarA& alpha, + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template @@ -149,11 +150,10 @@ class GerTester { T 
shrinkAngleToZeroTwoPiRange(const T input); template - void callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, - _ViewTypeA& A, - const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected, - const std::string& situation); + void callKkGerAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); const bool _A_is_complex; const bool _A_is_lr; @@ -195,8 +195,12 @@ GerTester::value ? 1.0e-6 : 1.0e-9), - _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _absTol(std::is_same<_AuxType, float>::value + ? 1.0e-6 + : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value + ? 5.0e-3 + : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -282,8 +286,7 @@ void GerTesterpopulateVariables(alpha, x.h_view, y.h_view, A.h_view, - h_expected.d_view, x.d_view, y.d_view, A.d_view, + this->populateVariables(alpha, x, y, A, h_expected.d_view, expectedResultIsKnown); // ******************************************************************** @@ -329,8 +332,7 @@ void GerTestercallKkGerAndCompareAgainstExpected( - alpha, x.d_view, y.d_view, A.d_view, A.h_view, h_expected.d_view, - "non const {x,y}"); + alpha, x.d_view, y.d_view, A, h_expected.d_view, "non const {x,y}"); } // ******************************************************************** @@ -339,8 +341,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, - A.d_view, A.h_view, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view_const, y.d_view, A, h_expected.d_view, "const x"); } @@ -350,8 +351,7 @@ void GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, - A.d_view, A.h_view, + this->callKkGerAndCompareAgainstExpected(alpha, x.d_view, y.d_view_const, A, h_expected.d_view, "const y"); } @@ -362,7 +362,7 @@ void 
GerTestercallKkGerAndCompareAgainstExpected(alpha, x.d_view_const, - y.d_view_const, A.d_view, A.h_view, + y.d_view_const, A, h_expected.d_view, "const {x,y}"); } @@ -384,52 +384,53 @@ void GerTester -void GerTester::populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, - _HostViewTypeY& h_y, - _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected, - _ViewTypeX& x, _ViewTypeY& y, - _ViewTypeA& A, - bool& expectedResultIsKnown) { +void GerTester< + ScalarX, tLayoutX, ScalarY, tLayoutY, ScalarA, tLayoutA, + Device>::populateVariables(ScalarA& alpha, + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, + bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, h_x, h_y, h_A, h_expected); - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(y, h_y); - Kokkos::deep_copy(A, h_A); + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, + h_expected); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(y.d_base, y.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); expectedResultIsKnown = true; } else if ((_M == 1) && (_N == 1)) { alpha = 3; - h_x[0] = 2; + x.h_view[0] = 2; - h_y[0] = 3; + y.h_view[0] = 3; - h_A(0, 0) = 7; + A.h_view(0, 0) = 7; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(y, h_y); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(y.d_base, y.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); h_expected(0, 0) = 25; expectedResultIsKnown = true; } else if ((_M == 1) && (_N == 2)) { alpha = 3; - h_x[0] = 2; + x.h_view[0] = 2; - h_y[0] = 3; - h_y[1] = 4; + y.h_view[0] = 3; + y.h_view[1] = 4; - h_A(0, 0) = 7; - h_A(0, 1) = -6; + A.h_view(0, 0) = 7; + A.h_view(0, 1) = -6; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(y, h_y); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(x.d_base, x.h_base); + 
Kokkos::deep_copy(y.d_base, y.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); h_expected(0, 0) = 25; h_expected(0, 1) = 18; @@ -437,20 +438,20 @@ void GerTester void GerTester:: - callKkGerAndCompareAgainstExpected(const ScalarA& alpha, TX& x, TY& y, - _ViewTypeA& A, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected, - const std::string& situation) { + callKkGerAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( @@ -1379,7 +1380,7 @@ void GerTestercompareKkGerAgainstExpected(alpha, h_A, h_expected); + this->compareKkGerAgainstExpected(alpha, A.h_view, h_expected); } } diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr.hpp index 4396c81bb20f..1253a8e32924 100644 --- a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr.hpp +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr.hpp @@ -76,9 +76,10 @@ class SyrTester { using _KAT_A = Kokkos::ArithTraits; using _AuxType = typename _KAT_A::mag_type; - void populateVariables(ScalarA& alpha, _HostViewTypeX& h_x, - _HostViewTypeA& h_A, _ViewTypeExpected& h_expected, - _ViewTypeX& x, _ViewTypeA& A, + void populateVariables(ScalarA& alpha, + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, bool& expectedResultIsKnown); template @@ -145,11 +146,9 @@ class SyrTester { T shrinkAngleToZeroTwoPiRange(const T input); template - void callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, - _ViewTypeA& A, - const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected, - const std::string& situation); + void callKkSyrAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, + const 
_ViewTypeExpected& h_expected, const std::string& situation); template void callKkGerAndCompareKkSyrAgainstIt( @@ -198,8 +197,12 @@ SyrTester::SyrTester() // large enough to require 'relTol' to value 5.0e-3. The same // calculations show no discrepancies for calculations with double. // **************************************************************** - _absTol(std::is_same<_AuxType, float>::value ? 1.0e-6 : 1.0e-9), - _relTol(std::is_same<_AuxType, float>::value ? 5.0e-3 : 1.0e-6), + _absTol(std::is_same<_AuxType, float>::value + ? 1.0e-6 + : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value + ? 5.0e-3 + : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), _M(-1), _N(-1), _useAnalyticalResults(false), @@ -279,8 +282,8 @@ void SyrTester::test( // ******************************************************************** // Step 2 of 7: populate alpha, h_x, h_A, h_expected, x, A // ******************************************************************** - this->populateVariables(alpha, x.h_view, A.h_view, h_expected.d_view, - x.d_view, A.d_view, expectedResultIsKnown); + this->populateVariables(alpha, x, A, h_expected.d_view, + expectedResultIsKnown); // ******************************************************************** // Step 3 of 7: populate h_vanilla @@ -324,8 +327,8 @@ void SyrTester::test( Kokkos::deep_copy(org_A.h_view, A.h_view); if (test_x) { - this->callKkSyrAndCompareAgainstExpected( - alpha, x.d_view, A.d_view, A.h_view, h_expected.d_view, "non const x"); + this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view, A, + h_expected.d_view, "non const x"); if ((_useAnalyticalResults == false) && // Just to save run time (_kkGerShouldThrowException == false)) { @@ -340,9 +343,8 @@ void SyrTester::test( if (test_cx) { Kokkos::deep_copy(A.d_base, org_A.d_base); - this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A.d_view, - A.h_view, h_expected.d_view, - "const x"); + 
this->callKkSyrAndCompareAgainstExpected(alpha, x.d_view_const, A, + h_expected.d_view, "const x"); } // ******************************************************************** @@ -368,42 +370,42 @@ void SyrTester::test( template void SyrTester::populateVariables( - ScalarA& alpha, _HostViewTypeX& h_x, _HostViewTypeA& h_A, - _ViewTypeExpected& h_expected, _ViewTypeX& x, _ViewTypeA& A, + ScalarA& alpha, view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeA, false>& A, _ViewTypeExpected& h_expected, bool& expectedResultIsKnown) { expectedResultIsKnown = false; if (_useAnalyticalResults) { - this->populateAnalyticalValues(alpha, h_x, h_A, h_expected); - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(A, h_A); + this->populateAnalyticalValues(alpha, x.h_view, A.h_view, h_expected); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); expectedResultIsKnown = true; } else if (_N == 1) { alpha = 3; - h_x[0] = 2; + x.h_view[0] = 2; - h_A(0, 0) = 7; + A.h_view(0, 0) = 7; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); h_expected(0, 0) = 19; expectedResultIsKnown = true; } else if (_N == 2) { alpha = 3; - h_x[0] = -2; - h_x[1] = 9; + x.h_view[0] = -2; + x.h_view[1] = 9; - h_A(0, 0) = 17; - h_A(0, 1) = -43; - h_A(1, 0) = -43; - h_A(1, 1) = 101; + A.h_view(0, 0) = 17; + A.h_view(0, 1) = -43; + A.h_view(1, 0) = -43; + A.h_view(1, 1) = 101; - Kokkos::deep_copy(x, h_x); - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); if (_useUpOption) { h_expected(0, 0) = 29; @@ -426,17 +428,17 @@ void SyrTester::populateVariables( { ScalarX randStart, randEnd; Test::getRandomBounds(1.0, randStart, randEnd); - Kokkos::fill_random(x, rand_pool, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); } { ScalarA randStart, randEnd; Test::getRandomBounds(1.0, randStart, 
randEnd); - Kokkos::fill_random(A, rand_pool, randStart, randEnd); + Kokkos::fill_random(A.d_view, rand_pool, randStart, randEnd); } - Kokkos::deep_copy(h_x, x); - Kokkos::deep_copy(h_A, A); + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(A.h_base, A.d_base); if (_useHermitianOption && _A_is_complex) { // **************************************************************** @@ -444,12 +446,12 @@ void SyrTester::populateVariables( // **************************************************************** for (int i(0); i < _N; ++i) { for (int j(i + 1); j < _N; ++j) { - h_A(i, j) = _KAT_A::conj(h_A(j, i)); + A.h_view(i, j) = _KAT_A::conj(A.h_view(j, i)); } } for (int i(0); i < _N; ++i) { - h_A(i, i) = 0.5 * (h_A(i, i) + _KAT_A::conj(h_A(i, i))); + A.h_view(i, i) = 0.5 * (A.h_view(i, i) + _KAT_A::conj(A.h_view(i, i))); } } else { // **************************************************************** @@ -457,18 +459,18 @@ void SyrTester::populateVariables( // **************************************************************** for (int i(0); i < _N; ++i) { for (int j(i + 1); j < _N; ++j) { - h_A(i, j) = h_A(j, i); + A.h_view(i, j) = A.h_view(j, i); } } } - Kokkos::deep_copy(A, h_A); + Kokkos::deep_copy(A.d_base, A.h_base); } #ifdef HAVE_KOKKOSKERNELS_DEBUG if (_N <= 2) { for (int i(0); i < _M; ++i) { for (int j(0); j < _N; ++j) { - std::cout << "h_origA(" << i << "," << j << ")=" << h_A(i, j) + std::cout << "h_origA(" << i << "," << j << ")=" << A.h_view(i, j) << std::endl; } } @@ -1433,10 +1435,9 @@ template template void SyrTester:: - callKkSyrAndCompareAgainstExpected(const ScalarA& alpha, TX& x, - _ViewTypeA& A, const _HostViewTypeA& h_A, - const _ViewTypeExpected& h_expected, - const std::string& situation) { + callKkSyrAndCompareAgainstExpected( + const ScalarA& alpha, TX& x, view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation 
<< "', alpha = " << alpha << std::endl; @@ -1457,7 +1458,7 @@ void SyrTester:: bool gotStdException(false); bool gotUnknownException(false); try { - KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A); + KokkosBlas::syr(mode.c_str(), uplo.c_str(), alpha, x, A.d_view); } catch (const std::exception& e) { #ifdef HAVE_KOKKOSKERNELS_DEBUG std::cout << "In Test_Blas2_syr, '" << situation @@ -1482,8 +1483,8 @@ void SyrTester:: << "have thrown a std::exception"; if ((gotStdException == false) && (gotUnknownException == false)) { - Kokkos::deep_copy(h_A, A); - this->compareKkSyrAgainstReference(alpha, h_A, h_expected); + Kokkos::deep_copy(A.h_base, A.d_base); + this->compareKkSyrAgainstReference(alpha, A.h_view, h_expected); } } diff --git a/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr2.hpp b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr2.hpp new file mode 100644 index 000000000000..c49eba765b65 --- /dev/null +++ b/packages/kokkos-kernels/blas/unit_test/Test_Blas2_syr2.hpp @@ -0,0 +1,1965 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +// ********************************************************************** +// The tests executed by the code below cover many combinations for +// the operations: +// --> A += alpha * x * y^T + alpha * y * x^T, or +// --> A += alpha * x * y^H + conj(alpha) * y * x^H +// 01) Type of 'x' components: float, double, complex, ... +// 02) Type of 'y' components: float, double, complex, ... 
+// 03) Type of 'A' components: float, double, complex, ... +// 04) Execution space: serial, threads, OpenMP, Cuda, ... +// 05) Layout of 'x' +// 06) Layout of 'y' +// 07) Layout of 'A' +// 08) Dimension of 'A' +// 09) Options 'const' or 'non const' for x view, when calling syr2() +// 10) Options 'const' or 'non const' for y view, when calling syr2() +// 11) Usage of analytical results in the tests +// 12) Options 'T' or 'H' when calling syr2() +// 13) Options 'U' or 'L' when calling syr2() +// +// Choices (01)-(05) are selected in the routines TEST_F() at the +// very bottom of the file, when calling test_syr2<...>(). +// +// Choices (06)-(13) are selected in routine test_syr2<...>(), +// when calling the method test() of class Test::Syr2Tester<...>. +// +// The class Test::Syr2Tester<...> represents the "core" of the test +// logic, where all calculations, comparisons, and success/failure +// decisions are performed. +// +// A high level explanation of method Test::SyrTester<...>::test() +// is given by the 7 steps named "Step 1 of 7" to "Step 7 of 7" +// in the code below. 
+// ********************************************************************** + +#include +#include +#include +#include +#include + +namespace Test { + +template +class Syr2Tester { + public: + Syr2Tester(); + + ~Syr2Tester(); + + void test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults = false, + const bool useHermitianOption = false, + const bool useUpOption = false); + + private: + using _ViewTypeX = Kokkos::View; + using _ViewTypeY = Kokkos::View; + using _ViewTypeA = Kokkos::View; + + using _HostViewTypeX = typename _ViewTypeX::HostMirror; + using _HostViewTypeY = typename _ViewTypeY::HostMirror; + using _HostViewTypeA = typename _ViewTypeA::HostMirror; + using _ViewTypeExpected = + Kokkos::View; + + using _KAT_A = Kokkos::ArithTraits; + using _AuxType = typename _KAT_A::mag_type; + + void populateVariables(ScalarA& alpha, + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, + bool& expectedResultIsKnown); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, _HostViewTypeY& h_y, + _HostViewTypeA& h_A, _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + populateVanillaValues(const T& alpha, const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, const _HostViewTypeA& h_A, + _ViewTypeExpected& 
h_vanilla); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected); + + template + typename std::enable_if>::value || + std::is_same>::value, + void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); + + template + typename std::enable_if>::value && + !std::is_same>::value, + void>::type + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference); + + template + T shrinkAngleToZeroTwoPiRange(const T input); + + template + void callKkSyr2AndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation); + + template + void callKkGerAndCompareKkSyr2AgainstIt( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _HostViewTypeA& h_A_syr2, const std::string& situation); + + const bool _A_is_complex; + const bool _A_is_lr; + const bool _A_is_ll; + const bool _testIsGpu; + const bool _vanillaUsesDifferentOrderOfOps; + const _AuxType _absTol; + const _AuxType _relTol; + int _M; + int _N; + bool _useAnalyticalResults; + bool _useHermitianOption; + bool _useUpOption; + bool _kkSyr2ShouldThrowException; + bool _kkGerShouldThrowException; +}; + +template +Syr2Tester::Syr2Tester() + : _A_is_complex(std::is_same>::value || + std::is_same>::value), + _A_is_lr(std::is_same::value), + _A_is_ll(std::is_same::value), + _testIsGpu(KokkosKernels::Impl::kk_is_gpu_exec_space< + typename Device::execution_space>()) +#ifdef 
KOKKOSKERNELS_ENABLE_TPL_BLAS + , + _vanillaUsesDifferentOrderOfOps(_A_is_lr) +#else + , + _vanillaUsesDifferentOrderOfOps(false) +#endif + , + // **************************************************************** + // Tolerances for double can be tighter than tolerances for float. + // + // In the case of calculations with float, a small amount of + // discrepancies between reference results and CUDA results are + // large enough to require 'relTol' to value 5.0e-3. The same + // calculations show no discrepancies for calculations with double. + // **************************************************************** + _absTol(std::is_same<_AuxType, float>::value + ? 1.0e-6 + : (std::is_same<_AuxType, double>::value ? 1.0e-9 : 0)), + _relTol(std::is_same<_AuxType, float>::value + ? 5.0e-3 + : (std::is_same<_AuxType, double>::value ? 1.0e-6 : 0)), + _M(-1), + _N(-1), + _useAnalyticalResults(false), + _useHermitianOption(false), + _useUpOption(false), + _kkSyr2ShouldThrowException(false), + _kkGerShouldThrowException(false) { +} + +template +Syr2Tester::~Syr2Tester() { + // Nothing to do +} + +template +void Syr2Tester::test(const int N, const int nonConstConstCombinations, + const bool useAnalyticalResults, + const bool useHermitianOption, + const bool useUpOption) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Entering Syr2Tester::test()... 
- - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - " + << std::endl; + + std::cout << "_A_is_complex = " << _A_is_complex + << ", _A_is_lr = " << _A_is_lr << ", _A_is_ll = " << _A_is_ll + << ", _testIsGpu = " << _testIsGpu + << ", _vanillaUsesDifferentOrderOfOps = " + << _vanillaUsesDifferentOrderOfOps << ", _absTol = " << _absTol + << ", _relTol = " << _relTol + << ", nonConstConstCombinations = " << nonConstConstCombinations + << ", useAnalyticalResults = " << useAnalyticalResults + << ", useHermitianOption = " << useHermitianOption + << ", useUpOption = " << useUpOption << std::endl; +#endif + // ******************************************************************** + // Step 1 of 7: declare main types and variables + // ******************************************************************** + _M = N; + _N = N; + _useAnalyticalResults = useAnalyticalResults; + _useHermitianOption = useHermitianOption; + _useUpOption = useUpOption; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS + _kkSyr2ShouldThrowException = false; + + _kkGerShouldThrowException = false; + if (_A_is_complex && _useHermitianOption) { + _kkGerShouldThrowException = !_A_is_ll; + } +#endif + + bool test_x(false); + bool test_cx(false); + if (nonConstConstCombinations == 0) { + test_x = true; + } else if (nonConstConstCombinations == 1) { + test_cx = true; + } else { + test_x = true; + test_cx = true; + } + + view_stride_adapter<_ViewTypeX, false> x("X", _M); + view_stride_adapter<_ViewTypeY, false> y("Y", _N); + view_stride_adapter<_ViewTypeA, false> A("A", _M, _N); + + view_stride_adapter<_ViewTypeExpected, true> h_expected( + "expected A += alpha * x * x^{t,h}", _M, _N); + bool expectedResultIsKnown = false; + + using AlphaCoeffType = typename _ViewTypeA::non_const_value_type; + ScalarA alpha(Kokkos::ArithTraits::zero()); + + // ******************************************************************** + // Step 2 of 7: populate alpha, h_x, h_A, 
h_expected, x, A + // ******************************************************************** + this->populateVariables(alpha, x, y, A, h_expected.d_view, + expectedResultIsKnown); + + // ******************************************************************** + // Step 3 of 7: populate h_vanilla + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_vanilla( + "vanilla = A + alpha * x * x^{t,h}", _M, _N); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2.hpp, computing vanilla A with alpha type = " + << typeid(alpha).name() << std::endl; +#endif + this->populateVanillaValues(alpha, x.h_view, y.h_view, A.h_view, + h_vanilla.d_view); + + // ******************************************************************** + // Step 4 of 7: use h_vanilla and h_expected as appropriate + // ******************************************************************** + if (expectedResultIsKnown) { + // ****************************************************************** + // Compare h_vanilla against h_expected + // ****************************************************************** + this->compareVanillaAgainstExpected(alpha, h_vanilla.d_view, + h_expected.d_view); + } else { + // ****************************************************************** + // Copy h_vanilla to h_expected + // ****************************************************************** + Kokkos::deep_copy(h_expected.d_base, h_vanilla.d_base); + } + + // ******************************************************************** + // Step 5 of 7: test with 'non const x' + // ******************************************************************** + view_stride_adapter<_ViewTypeA, false> org_A("Org_A", _M, _N); + Kokkos::deep_copy(org_A.d_base, A.d_base); + Kokkos::deep_copy(org_A.h_view, A.h_view); + + if (test_x) { + this->callKkSyr2AndCompareAgainstExpected(alpha, x.d_view, y.d_view, A, + h_expected.d_view, "non const x"); + + if ((_useAnalyticalResults == 
false) && // Just to save run time + (_kkGerShouldThrowException == false)) { + this->callKkGerAndCompareKkSyr2AgainstIt(alpha, x.d_view, y.d_view, org_A, + A.h_view, "non const x"); + } + } + + // ******************************************************************** + // Step 6 of 7: test with const x + // ******************************************************************** + if (test_cx) { + Kokkos::deep_copy(A.d_base, org_A.d_base); + + this->callKkSyr2AndCompareAgainstExpected( + alpha, x.d_view_const, y.d_view_const, A, h_expected.d_view, "const x"); + } + + // ******************************************************************** + // Step 7 of 7: tests with invalid values on the first input parameter + // ******************************************************************** + EXPECT_ANY_THROW( + KokkosBlas::syr2(".", "U", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for mode '.'"; + EXPECT_ANY_THROW( + KokkosBlas::syr2("", "U", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for mode ''"; + EXPECT_ANY_THROW( + KokkosBlas::syr2("T", ".", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for uplo '.'"; + EXPECT_ANY_THROW( + KokkosBlas::syr2("T", "", alpha, x.d_view, y.d_view, A.d_view)) + << "Failed test: kk syr2 should have thrown an exception for uplo ''"; + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Leaving Syr2Tester::test() - - - - - - - - - - - - - - - - - - " + "- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - " + "- - - - - - - " + << std::endl; +#endif +} + +template +void Syr2Tester< + ScalarX, tLayoutX, ScalarY, tLayoutY, ScalarA, tLayoutA, + Device>::populateVariables(ScalarA& alpha, + view_stride_adapter<_ViewTypeX, false>& x, + view_stride_adapter<_ViewTypeY, false>& y, + view_stride_adapter<_ViewTypeA, false>& A, + _ViewTypeExpected& h_expected, + bool& expectedResultIsKnown) { + 
expectedResultIsKnown = false; + + if (_useAnalyticalResults) { + this->populateAnalyticalValues(alpha, x.h_view, y.h_view, A.h_view, + h_expected); + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(y.d_base, y.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); + + expectedResultIsKnown = true; + } else if (_N == 1) { + alpha = 3; + + x.h_view[0] = 2; + + y.h_view[0] = 4; + + A.h_view(0, 0) = 7; + + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(y.d_base, y.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); + + h_expected(0, 0) = 55; + expectedResultIsKnown = true; + } else if (_N == 2) { + alpha = 3; + + x.h_view[0] = -2; + x.h_view[1] = 9; + + y.h_view[0] = 5; + y.h_view[1] = -4; + + A.h_view(0, 0) = 17; + A.h_view(0, 1) = -43; + A.h_view(1, 0) = -43; + A.h_view(1, 1) = 101; + + Kokkos::deep_copy(x.d_base, x.h_base); + Kokkos::deep_copy(y.d_base, y.h_base); + Kokkos::deep_copy(A.d_base, A.h_base); + + if (_useUpOption) { + h_expected(0, 0) = -43; + h_expected(0, 1) = 116; + h_expected(1, 0) = -43; + h_expected(1, 1) = -115; + } else { + h_expected(0, 0) = -43; + h_expected(0, 1) = -43; + h_expected(1, 0) = 116; + h_expected(1, 1) = -115; + } + expectedResultIsKnown = true; + } else { + alpha = 3; + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + { + ScalarX randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(x.d_view, rand_pool, randStart, randEnd); + } + + { + ScalarY randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(y.d_view, rand_pool, randStart, randEnd); + } + + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(A.d_view, rand_pool, randStart, randEnd); + } + + Kokkos::deep_copy(x.h_base, x.d_base); + Kokkos::deep_copy(y.h_base, y.d_base); + Kokkos::deep_copy(A.h_base, A.d_base); + + if (_useHermitianOption && _A_is_complex) { + // **************************************************************** 
+ // Make h_A Hermitian + // **************************************************************** + for (int i(0); i < _N; ++i) { + for (int j(i + 1); j < _N; ++j) { + A.h_view(i, j) = _KAT_A::conj(A.h_view(j, i)); + } + } + + for (int i(0); i < _N; ++i) { + A.h_view(i, i) = 0.5 * (A.h_view(i, i) + _KAT_A::conj(A.h_view(i, i))); + } + } else { + // **************************************************************** + // Make h_A symmetric + // **************************************************************** + for (int i(0); i < _N; ++i) { + for (int j(i + 1); j < _N; ++j) { + A.h_view(i, j) = A.h_view(j, i); + } + } + } + Kokkos::deep_copy(A.d_base, A.h_base); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_origA(" << i << "," << j << ") = " << A.h_view(i, j) + << std::endl; + } + } + } +#endif +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + alpha.real() = 1.4; + alpha.imag() = -2.3; + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i].real() = sin(auxI); + h_x[i].imag() = sin(auxI); + } + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_y[i].real() = cos(auxI); + h_y[i].imag() = cos(auxI); + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_A(i, j).real() = sin(auxIpJ); + h_A(i, j).imag() = -sin(auxImJ); + } 
else { + h_A(i, j).real() = sin(auxIpJ); + h_A(i, j).imag() = sin(auxImJ); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j).real() = sin(auxIpJ); + h_A(i, j).imag() = sin(auxIpJ); + } + } + } + + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + _AuxType auxImJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i - j)); + h_expected(i, j).real() = 3.8 * sin(auxIpJ); + h_expected(i, j).imag() = -5.6 * sin(auxImJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j).real() = 5.6 * sin(auxIpJ); + h_expected(i, j).imag() = 3.8 * sin(auxIpJ); + } else { + h_expected(i, j).real() = h_A(i, j).real(); + h_expected(i, j).imag() = h_A(i, j).imag(); + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester::populateAnalyticalValues(T& alpha, _HostViewTypeX& h_x, + _HostViewTypeY& h_y, + _HostViewTypeA& h_A, + _ViewTypeExpected& h_expected) { + alpha = std::is_same<_AuxType, int>::value ? 
1 : 1.1; + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_x[i] = sin(auxI); + } + + for (int i = 0; i < _M; ++i) { + _AuxType auxI = this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i)); + h_y[i] = cos(auxI); + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_A(i, j) = .1 * sin(auxIpJ); + } + } + + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + _AuxType auxIpJ = + this->shrinkAngleToZeroTwoPiRange(static_cast<_AuxType>(i + j)); + h_expected(i, j) = 1.2 * sin(auxIpJ); + } else { + h_expected(i, j) = h_A(i, j); + } + } + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_vanillaUsesDifferentOrderOfOps) { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * _KAT_A::conj(h_y(j)) * h_x(i) + + _KAT_A::conj(alpha) * _KAT_A::conj(h_x(j)) * h_y(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + for (int i = 0; i < _N; ++i) { + h_vanilla(i, i).imag() = 0.; + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(j) * h_y(i) + alpha * h_y(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } else { + if (_useHermitianOption) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j 
< _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + for (int i = 0; i < _N; ++i) { + h_vanilla(i, i).imag() = 0.; + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester::populateVanillaValues(const T& alpha, + const _HostViewTypeX& h_x, + const _HostViewTypeY& h_y, + const _HostViewTypeA& h_A, + _ViewTypeExpected& h_vanilla) { + if (_useHermitianOption) { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(j) * _KAT_A::conj(h_y(i)) + + _KAT_A::conj(alpha) * h_y(j) * _KAT_A::conj(h_x(i)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * _KAT_A::conj(h_y(j)) + + _KAT_A::conj(alpha) * h_y(i) * _KAT_A::conj(h_x(j)); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } else { + if (_vanillaUsesDifferentOrderOfOps) { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(j) * h_y(i) + 
alpha * h_y(j) * h_x(i); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } else { + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + h_vanilla(i, j) = + h_A(i, j) + alpha * h_x(i) * h_y(j) + alpha * h_y(i) * h_x(j); + } else { + h_vanilla(i, j) = h_A(i, j); + } + } + } + } + } +} + +template +template +T Syr2Tester::shrinkAngleToZeroTwoPiRange(const T input) { + T output(input); +#if 0 + T twoPi( 2. * Kokkos::numbers::pi ); + if (input > 0.) { + output -= std::floor( input / twoPi ) * twoPi; + } + else if (input < 0.) { + output += std::floor( -input / twoPi ) * twoPi; + } +#endif + return output; +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) + << ", h_van(" << i << "," << j << ") = " << h_vanilla(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j).real() - h_vanilla(i, j).real()); + errorHappened = false; + if (h_expected(i, j).real() == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << ", _KAT_A::abs(h_expected(i,j).real() - " + "h_vanilla(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + diff = _KAT_A::abs(h_expected(i, j).imag() - h_vanilla(i, j).imag()); + errorHappened = false; + if (h_expected(i, j).imag() == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << ", _KAT_A::abs(h_expected(i,j).imag() - " + "h_vanilla(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + } // for j + } // for i + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_expected(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_vanilla(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsReal > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from analytical on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_expected(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_vanilla(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsImag > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) + << "Failed test" << msg.str(); + } + } else { + int numErrorsReal(0); + int numErrorsImag(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j).real() != h_vanilla(i, j).real()) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsReal == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).real() = " + << h_expected(i, j).real() + << ", h_vanilla(i,j).real() = " << h_vanilla(i, j).real() + << std::endl; + } +#endif + numErrorsReal++; + } + + if (h_expected(i, j).imag() != h_vanilla(i, j).imag()) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsImag == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j).imag() = " + << h_expected(i, j).imag() + << ", h_vanilla(i,j).imag() = " << h_vanilla(i, j).imag() + << std::endl; + } +#endif + numErrorsImag++; + } + } // for j + } // for i + EXPECT_EQ(numErrorsReal, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on real components" + << ", numErrorsReal = " << numErrorsReal; + EXPECT_EQ(numErrorsImag, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect on imag components" + << ", 
numErrorsImag = " << numErrorsImag; + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester:: + compareVanillaAgainstExpected(const T& alpha, + const _ViewTypeExpected& h_vanilla, + const _ViewTypeExpected& h_expected) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ") = " << h_expected(i, j) + << ", h_van(" << i << "," << j << ") = " << h_vanilla(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + if (_useAnalyticalResults) { + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_expected(i, j) - h_vanilla(i, j)); + errorHappened = false; + if (h_expected(i, j) == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_expected(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_expected(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) + << ", _KAT_A::abs(h_expected(i,j) - h_vanilla(i,j)) = " + << diff << ", diffThreshold = " << diffThreshold + << std::endl; +#endif + } + } // for j + } // for i + + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla differs too much from expected" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_expected(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_expected(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_vanilla(i,j) = " + << (((_M > 0) && (_N > 0)) + ? 
h_vanilla(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrors > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + } else { + int numErrors(0); + + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + if (h_expected(i, j) != h_vanilla(i, j)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrors == 0) { + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_expected(i,j) = " << h_expected(i, j) + << ", h_vanilla(i,j) = " << h_vanilla(i, j) << std::endl; + } +#endif + numErrors++; + } + } // for j + } // for i + EXPECT_EQ(numErrors, 0) + << "Failed test" + << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": vanilla result is incorrect" + << ", numErrors = " << numErrors; + } +} + +// Code for complex values +template +template +typename std::enable_if>::value || + std::is_same>::value, + void>::type +Syr2Tester:: + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) + << ", h_A(" << i << "," << j << ") = " << h_A(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsRealAbs(0); + int numErrorsRealRel(0); + int numErrorsImagAbs(0); + int numErrorsImagRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRealRel(0.); + int 
iForMaxErrorRealRel(0); + int jForMaxErrorRealRel(0); + _AuxType maxErrorImagRel(0.); + int iForMaxErrorImagRel(0); + int jForMaxErrorImagRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_reference(i, j).real() - h_A(i, j).real()); + errorHappened = false; + if (h_reference(i, j).real() == 0.) { + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).real()); + if (maxErrorRealRel < aux) { + maxErrorRealRel = aux; + iForMaxErrorRealRel = i; + jForMaxErrorRealRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).real()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRealRel++; + } + } + if (errorHappened && (numErrorsRealAbs + numErrorsRealRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).real() = " << h_reference(i, j).real() + << ", h_A(i,j).real() = " << h_A(i, j).real() + << ", _KAT_A::abs(h_reference(i,j).real() - h_A(i,j).real()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + diff = _KAT_A::abs(h_reference(i, j).imag() - h_A(i, j).imag()); + errorHappened = false; + if (h_reference(i, j).imag() == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j).imag()); + if (maxErrorImagRel < aux) { + maxErrorImagRel = aux; + iForMaxErrorImagRel = i; + jForMaxErrorImagRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j).imag()); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsImagRel++; + } + } + if (errorHappened && (numErrorsImagAbs + numErrorsImagRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j).imag() = " << h_reference(i, j).imag() + << ", h_A(i,j).imag() = " << h_A(i, j).imag() + << ", _KAT_A::abs(h_reference(i,j).imag() - h_A(i,j).imag()) = " + << diff << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + } // for j + } // for i + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; + if ((_M == 2131) && (_N == 2131)) { + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(11, 2119) = (" << h_reference(11, 2119).real() + << ", " << h_reference(11, 2119).imag() << ")" + << ", h_A(11, 2119) = (" << h_A(11, 2119).real() << ", " + << h_A(11, 2119).imag() << ")" << std::endl; + std::cout << "Information" + << ": A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", h_reference(710, 1065) = (" << h_reference(710, 1065).real() + << ", " << h_reference(710, 1065).imag() << ")" + << ", h_A(710, 1065) = (" << h_A(710, 1065).real() << ", " + << h_A(710, 1065).imag() << ")" << std::endl; + } +#endif + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": 
syr2 result is incorrect on real components" + << ", numErrorsRealAbs = " << numErrorsRealAbs + << ", numErrorsRealRel = " << numErrorsRealRel + << ", maxErrorRealRel = " << maxErrorRealRel + << ", iForMaxErrorRealRel = " << iForMaxErrorRealRel + << ", jForMaxErrorRealRel = " << jForMaxErrorRealRel + << ", h_reference(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", h_A(i,j).real() = " + << (((_M > 0) && (_N > 0)) + ? h_A(iForMaxErrorRealRel, jForMaxErrorRealRel).real() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsReal(numErrorsRealAbs + numErrorsRealRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsReal > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsReal, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ": syr2 result is incorrect on imag components" + << ", numErrorsImagAbs = " << numErrorsImagAbs + << ", numErrorsImagRel = " << numErrorsImagRel + << ", maxErrorImagRel = " << maxErrorImagRel + << ", iForMaxErrorImagRel = " << iForMaxErrorImagRel + << ", jForMaxErrorImagRel = " << jForMaxErrorImagRel + << ", h_reference(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", h_A(i,j).imag() = " + << (((_M > 0) && (_N > 0)) + ? 
h_A(iForMaxErrorImagRel, jForMaxErrorImagRel).imag() + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrorsImag(numErrorsImagAbs + numErrorsImagRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrorsImag > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrorsImag, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +// Code for non-complex values +template +template +typename std::enable_if>::value && + !std::is_same>::value, + void>::type +Syr2Tester:: + compareKkSyr2AgainstReference(const T& alpha, const _HostViewTypeA& h_A, + const _ViewTypeExpected& h_reference) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (_N <= 2) { + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + std::cout << "h_exp(" << i << "," << j << ") = " << h_reference(i, j) + << ", h_A(" << i << "," << j << ") = " << h_A(i, j) + << std::endl; + } + } + } +#endif + int maxNumErrorsAllowed(static_cast(_M) * static_cast(_N) * + 1.e-3); + + int numErrorsAbs(0); + int numErrorsRel(0); + _AuxType diff(0.); + _AuxType diffThreshold(0.); + bool errorHappened(false); + _AuxType maxErrorRel(0.); + int iForMaxErrorRel(0); + int jForMaxErrorRel(0); + for (int i(0); i < _M; ++i) { + for (int j(0); j < _N; ++j) { + diff = _KAT_A::abs(h_reference(i, j) - h_A(i, j)); + errorHappened = false; + if (h_reference(i, j) == 0.) 
{ + diffThreshold = _KAT_A::abs(_absTol); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsAbs++; + } + } else { + _AuxType aux = diff / _KAT_A::abs(h_reference(i, j)); + if (maxErrorRel < aux) { + maxErrorRel = aux; + iForMaxErrorRel = i; + jForMaxErrorRel = j; + } + + diffThreshold = _KAT_A::abs(_relTol * h_reference(i, j)); + if (diff > diffThreshold) { + errorHappened = true; + numErrorsRel++; + } + } + if (errorHappened && (numErrorsAbs + numErrorsRel == 1)) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "ERROR, i = " << i << ", j = " << j + << ": h_reference(i,j) = " << h_reference(i, j) + << ", h_A(i,j) = " << h_A(i, j) + << ", _KAT_A::abs(h_reference(i,j) - h_A(i,j)) = " << diff + << ", diffThreshold = " << diffThreshold << std::endl; +#endif + } + } // for j + } // for i +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel + << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? 
h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed << std::endl; +#endif + { + std::ostringstream msg; + msg << ", A is " << _M << " by " << _N << ", _A_is_lr = " << _A_is_lr + << ", _A_is_ll = " << _A_is_ll + << ", alpha type = " << typeid(alpha).name() + << ", _useHermitianOption = " << _useHermitianOption + << ", _useUpOption = " << _useUpOption << ": syr2 result is incorrect" + << ", numErrorsAbs = " << numErrorsAbs + << ", numErrorsRel = " << numErrorsRel + << ", maxErrorRel = " << maxErrorRel + << ", iForMaxErrorRel = " << iForMaxErrorRel + << ", jForMaxErrorRel = " << jForMaxErrorRel << ", h_reference(i,j) = " + << (((_M > 0) && (_N > 0)) + ? h_reference(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", h_A(i,j) = " + << (((_M > 0) && (_N > 0)) ? h_A(iForMaxErrorRel, jForMaxErrorRel) + : 9.999e+99) + << ", maxNumErrorsAllowed = " << maxNumErrorsAllowed; + + int numErrors(numErrorsAbs + numErrorsRel); +#ifdef HAVE_KOKKOSKERNELS_DEBUG + if (numErrors > 0) { + std::cout << "WARNING" << msg.str() << std::endl; + } +#endif + EXPECT_LE(numErrors, maxNumErrorsAllowed) << "Failed test" << msg.str(); + } +} + +template +template +void Syr2Tester:: + callKkSyr2AndCompareAgainstExpected( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& A, + const _ViewTypeExpected& h_expected, const std::string& situation) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha + << std::endl; + std::cout << "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::syr2()" + << ": ViewTypeA = " << typeid(_ViewTypeA).name() + << ", _kkSyr2ShouldThrowException = " << _kkSyr2ShouldThrowException + << std::endl; +#endif + std::string mode = _useHermitianOption ? "H" : "T"; + std::string uplo = _useUpOption ? 
"U" : "L"; + bool gotStdException(false); + bool gotUnknownException(false); + try { + KokkosBlas::syr2(mode.c_str(), uplo.c_str(), alpha, x, y, A.d_view); + Kokkos::fence(); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "': caught exception, e.what() = " << e.what() << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "': caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened"; + + EXPECT_EQ(gotStdException, _kkSyr2ShouldThrowException) + << "Failed test, '" << situation << "': kk syr2() should" + << (_kkSyr2ShouldThrowException ? " " : " not ") + << "have thrown a std::exception"; + + if ((gotStdException == false) && (gotUnknownException == false)) { + Kokkos::deep_copy(A.h_base, A.d_base); + this->compareKkSyr2AgainstReference(alpha, A.h_view, h_expected); + } +} + +template +template +void Syr2Tester:: + callKkGerAndCompareKkSyr2AgainstIt( + const ScalarA& alpha, TX& x, TY& y, + view_stride_adapter<_ViewTypeA, false>& org_A, + const _HostViewTypeA& h_A_syr2, const std::string& situation) { + view_stride_adapter<_ViewTypeA, false> A_ger("A_ger", _M, _N); + Kokkos::deep_copy(A_ger.d_base, org_A.d_base); + + // ******************************************************************** + // Call ger() + // ******************************************************************** +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation << "', alpha = " << alpha + << std::endl; + std::cout << "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::ger()" + << ": ViewTypeA = " << typeid(_ViewTypeA).name() + << ", _kkGerShouldThrowException = " << _kkGerShouldThrowException + << std::endl; +#endif + std::string 
mode = _useHermitianOption ? "H" : "T"; + bool gotStdException(false); + bool gotUnknownException(false); + try { + KokkosBlas::ger(mode.c_str(), alpha, x, y, A_ger.d_view); + Kokkos::fence(); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 1: caught exception, e.what() = " << e.what() + << std::endl; +#endif + gotStdException = true; + } catch (...) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 1: caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call 1"; + + EXPECT_EQ(gotStdException, false) + << "Failed test, '" << situation + << "': kk ger() 1 should not have thrown a std::exception"; + + // ******************************************************************** + // Call ger() again + // ******************************************************************** +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout + << "In Test_Blas2_syr2.hpp, right before calling KokkosBlas::ger() again"; +#endif + try { + if (_useHermitianOption) { + KokkosBlas::ger(mode.c_str(), _KAT_A::conj(alpha), y, x, A_ger.d_view); + } else { + KokkosBlas::ger(mode.c_str(), alpha, y, x, A_ger.d_view); + } + Kokkos::fence(); + } catch (const std::exception& e) { +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 2: caught exception, e.what() = " << e.what() + << std::endl; +#endif + gotStdException = true; + } catch (...) 
{ +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "In Test_Blas2_syr2, '" << situation + << "', ger() call 2: caught unknown exception" << std::endl; +#endif + gotUnknownException = true; + } + + EXPECT_EQ(gotUnknownException, false) + << "Failed test, '" << situation + << "': unknown exception should not have happened for ger() call 2"; + + EXPECT_EQ(gotStdException, false) + << "Failed test, '" << situation + << "': kk ger() 2 should not have thrown a std::exception"; + + // ******************************************************************** + // Prepare h_ger_reference to be compared against h_A_syr2 + // ******************************************************************** + view_stride_adapter<_ViewTypeExpected, true> h_ger_reference( + "h_ger_reference", _M, _N); + Kokkos::deep_copy(h_ger_reference.d_base, A_ger.d_base); + Kokkos::deep_copy(h_ger_reference.h_base, h_ger_reference.d_base); + + std::string uplo = _useUpOption ? "U" : "L"; + for (int i = 0; i < _M; ++i) { + for (int j = 0; j < _N; ++j) { + if (((_useUpOption == true) && (i <= j)) || + ((_useUpOption == false) && (i >= j))) { + // Keep h_ger_reference as already computed + } else { + h_ger_reference.h_view(i, j) = org_A.h_view(i, j); + } + } + } + if (_useHermitianOption && _A_is_complex) { + for (int i(0); i < _N; ++i) { + h_ger_reference.h_view(i, i) = + 0.5 * (h_ger_reference.h_view(i, i) + + _KAT_A::conj(h_ger_reference.h_view(i, i))); + } + } + + // ******************************************************************** + // Compare + // ******************************************************************** + this->compareKkSyr2AgainstReference(alpha, h_A_syr2, h_ger_reference.h_view); +} + +} // namespace Test + +template +#ifdef HAVE_KOKKOSKERNELS_DEBUG +int test_syr2(const std::string& caseName) { + std::cout << "+==============================================================" + "============" + << std::endl; + std::cout << "Starting " << caseName << "..." 
<< std::endl; +#else +int test_syr2(const std::string& /*caseName*/) { +#endif + bool xBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool yBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool aBool = std::is_same::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value; + bool useAnalyticalResults = xBool && yBool && aBool; + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; + std::cout << "Starting " << caseName << " for LAYOUTLEFT ..." << std::endl; +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << " for LAYOUTLEFT" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; +#endif +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << 
"+--------------------------------------------------------------" + "------------" + << std::endl; + std::cout << "Starting " << caseName << " for LAYOUTRIGHT ..." << std::endl; +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << " for LAYOUTRIGHT" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; +#endif +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; + std::cout << "Starting " << caseName << " for LAYOUTSTRIDE ..." 
<< std::endl; +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(0, 0); + tester.test(1, 0); + tester.test(2, 0); + tester.test(13, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, false); + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, false); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, false); + tester.test(50, 0, false, true, false); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + + tester.test(50, 4); + tester.test(2131, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << " for LAYOUTSTRIDE" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; +#endif +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; + std::cout << "Starting " << caseName << " for MIXED LAYOUTS ..." 
<< std::endl; +#endif + if (true) { + Test::Syr2Tester + tester; + tester.test(1, 0); + tester.test(2, 0); + tester.test(1024, 0); + + if (useAnalyticalResults) { + tester.test(1024, 0, true, false, true); + tester.test(1024, 0, true, true, true); + } + + tester.test(2, 0, false, false, true); + tester.test(50, 0, false, false, true); + tester.test(2, 0, false, true, true); + tester.test(50, 0, false, true, true); + } + + if (true) { + Test::Syr2Tester + tester; + tester.test(1024, 0); + } + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << " for MIXED LAYOUTS" << std::endl; + std::cout << "+--------------------------------------------------------------" + "------------" + << std::endl; +#endif +#endif + +#ifdef HAVE_KOKKOSKERNELS_DEBUG + std::cout << "Finished " << caseName << std::endl; + std::cout << "+==============================================================" + "============" + << std::endl; +#endif + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_float"); + test_syr2("test case syr2_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_complex_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_float"); + test_syr2, Kokkos::complex, + Kokkos::complex, TestDevice>("test case syr2_complex_float"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_double"); + test_syr2("test case syr2_double"); + Kokkos::Profiling::popRegion(); +} 
+#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_complex_double"); + test_syr2, Kokkos::complex, + Kokkos::complex, TestDevice>( + "test case syr2_complex_double"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, syr2_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int"); + test_syr2("test case syr2_int"); + Kokkos::Profiling::popRegion(); +} +#endif + +#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) +TEST_F(TestCategory, syr2_int_float_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::syr2_int_float_double"); + test_syr2("test case syr2_mixed_types"); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/packages/kokkos-kernels/cm_generate_makefile.bash b/packages/kokkos-kernels/cm_generate_makefile.bash index 3358ae2eb8b8..e872789c7266 100755 --- a/packages/kokkos-kernels/cm_generate_makefile.bash +++ b/packages/kokkos-kernels/cm_generate_makefile.bash @@ -178,6 +178,7 @@ get_kernels_tpls_list() { KOKKOSKERNELS_USER_TPL_LIBNAME_CMD= CUBLAS_DEFAULT=OFF CUSPARSE_DEFAULT=OFF + CUSOLVER_DEFAULT=OFF ROCBLAS_DEFAULT=OFF ROCSPARSE_DEFAULT=OFF PARSE_TPLS_LIST=$(echo $KOKKOSKERNELS_TPLS | tr "," "\n") @@ -191,6 +192,9 @@ get_kernels_tpls_list() { if [ "$UC_TPLS" == "CUSPARSE" ]; then CUSPARSE_DEFAULT=ON fi + if [ "$UC_TPLS" == "CUSOLVER" ]; then + CUSOLVER_DEFAULT=ON + fi if [ "$UC_TPLS" == "ROCBLAS" ]; then ROCBLAS_DEFAULT=ON fi @@ -224,6 +228,9 @@ get_kernels_tpls_list() { if [ "$CUSPARSE_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi + if [ "$CUSOLVER_DEFAULT" == "OFF" ]; then + 
KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_CUSOLVER=OFF ${KOKKOSKERNELS_TPLS_CMD}" + fi if [ "$ROCBLAS_DEFAULT" == "OFF" ]; then KOKKOSKERNELS_TPLS_CMD="-DKokkosKernels_ENABLE_TPL_ROCBLAS=OFF ${KOKKOSKERNELS_TPLS_CMD}" fi @@ -320,7 +327,6 @@ display_help_text() { echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -487,10 +493,6 @@ do KOKKOS_HWLOC=ON HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - KOKKOS_MEMKIND=ON - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -710,15 +712,6 @@ else KOKKOS_HWLOC_CMD= fi -if [ "$KOKKOS_MEMKIND" == "ON" ]; then - KOKKOS_MEMKIND_CMD=-DKokkos_ENABLE_MEMKIND=ON - if [ "$MEMKIND_PATH" != "" ]; then - KOKKOS_MEMKIND_PATH_CMD=-DMEMKIND_ROOT=$MEMKIND_PATH - fi -else - KOKKOS_MEMKIND_CMD= -fi - # Currently assumes script is in base kokkos-kernels directory if [ ! 
-e ${KOKKOSKERNELS_PATH}/CMakeLists.txt ]; then @@ -811,9 +804,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} 
-DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} ${KOKKOS_PASSTHRU_CMAKE_FLAGS} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J diff --git a/packages/kokkos-kernels/cmake/Dependencies.cmake b/packages/kokkos-kernels/cmake/Dependencies.cmake index b94dd3e4c922..a52f0c098cd4 100644 --- a/packages/kokkos-kernels/cmake/Dependencies.cmake +++ b/packages/kokkos-kernels/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES Kokkos - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE METIS SuperLU Cholmod CUBLAS ROCBLAS ROCSPARSE + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK METIS SuperLU Cholmod CUBLAS CUSPARSE CUSOLVER ROCBLAS ROCSPARSE ROCSOLVER 
TEST_OPTIONAL_TPLS yamlcpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in diff --git a/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in b/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in index d94860e38067..ef8fea78b845 100644 --- a/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in +++ b/packages/kokkos-kernels/cmake/KokkosKernels_config.h.in @@ -53,6 +53,7 @@ /* Whether to build kernels for execution space Kokkos::HIP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE /* Whether to build kernels for execution space Kokkos::Experimental::SYCL */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_SYCL #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE @@ -114,10 +115,12 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACK /* MKL library */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MKL -/* CUSPARSE */ -#cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSPARSE /* CUBLAS */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUBLAS +/* CUSPARSE */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +/* CUSOLVER */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_CUSOLVER /* MAGMA */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_MAGMA /* SuperLU */ @@ -138,6 +141,8 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCBLAS /* ROCSPARSE */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +/* ROCSOLVER */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER #cmakedefine KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV diff --git a/packages/kokkos-kernels/cmake/Modules/FindTPLCUBLAS.cmake b/packages/kokkos-kernels/cmake/Modules/FindTPLCUBLAS.cmake index 890c2dac6277..164f3bf4c48f 100644 --- a/packages/kokkos-kernels/cmake/Modules/FindTPLCUBLAS.cmake +++ b/packages/kokkos-kernels/cmake/Modules/FindTPLCUBLAS.cmake @@ -1,18 +1,47 @@ -FIND_PACKAGE(CUDA) - -INCLUDE(FindPackageHandleStandardArgs) -IF (NOT CUDA_FOUND) - #Important note here: this find Module is named TPLCUBLAS - #The eventual target is 
named CUBLAS. To avoid naming conflicts - #the find module is called TPLCUBLAS. This call will cause - #the find_package call to fail in a "standard" CMake way - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUBLAS REQUIRED_VARS CUDA_FOUND) -ELSE() - #The libraries might be empty - OR they might explicitly be not found - IF("${CUDA_CUBLAS_LIBRARIES}" MATCHES "NOTFOUND") - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUBLAS REQUIRED_VARS CUDA_CUBLAS_LIBRARIES) +if(CUBLAS_LIBRARIES AND CUBLAS_LIBRARY_DIRS AND CUBLAS_INCLUDE_DIRS) + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES ${CUBLAS_LIBRARIES} + LIBRARY_PATHS ${CUBLAS_LIBRARY_DIRS} + HEADER_PATHS ${CUBLAS_INCLUDE_DIRS} + ) +elseif(CUBLAS_LIBRARIES AND CUBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES ${CUBLAS_LIBRARIES} + LIBRARY_PATHS ${CUBLAS_LIBRARY_DIRS} + HEADER cublas.h + ) +elseif(CUBLAS_LIBRARIES) + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES ${CUBLAS_LIBRARIES} + HEADER cublas.h + ) +elseif(CUBLAS_LIBRARY_DIRS) + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES cublas + LIBRARY_PATHS ${CUBLAS_LIBRARY_DIRS} + HEADER cublas.h + ) +elseif(CUBLAS_ROOT OR KokkosKernels_CUBLAS_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(CUBLAS INTERFACE + LIBRARIES cublas + HEADER cublas.h + ) +else() # backwards-compatible way + FIND_PACKAGE(CUDA) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT CUDA_FOUND) + #Important note here: this find Module is named TPLCUBLAS + #The eventual target is named CUBLAS. To avoid naming conflicts + #the find module is called TPLCUBLAS. 
This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUBLAS REQUIRED_VARS CUDA_FOUND) ELSE() - KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUBLAS INTERFACE - LINK_LIBRARIES "${CUDA_CUBLAS_LIBRARIES}") + #The libraries might be empty - OR they might explicitly be not found + IF("${CUDA_CUBLAS_LIBRARIES}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUBLAS REQUIRED_VARS CUDA_CUBLAS_LIBRARIES) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUBLAS INTERFACE + LINK_LIBRARIES "${CUDA_CUBLAS_LIBRARIES}") + ENDIF() ENDIF() -ENDIF() +endif() diff --git a/packages/kokkos-kernels/cmake/Modules/FindTPLCUSOLVER.cmake b/packages/kokkos-kernels/cmake/Modules/FindTPLCUSOLVER.cmake new file mode 100644 index 000000000000..3e436394950f --- /dev/null +++ b/packages/kokkos-kernels/cmake/Modules/FindTPLCUSOLVER.cmake @@ -0,0 +1,46 @@ +if(CUSOLVER_LIBRARIES AND CUSOLVER_LIBRARY_DIRS AND CUSOLVER_INCLUDE_DIRS) + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES ${CUSOLVER_LIBRARIES} + LIBRARY_PATHS ${CUSOLVER_LIBRARY_DIRS} + HEADER_PATHS ${CUSOLVER_INCLUDE_DIRS} + ) +elseif(CUSOLVER_LIBRARIES AND CUSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES ${CUSOLVER_LIBRARIES} + LIBRARY_PATHS ${CUSOLVER_LIBRARY_DIRS} + HEADER cusolverDn.h + ) +elseif(CUSOLVER_LIBRARIES) + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES ${CUSOLVER_LIBRARIES} + HEADER cusolverDn.h + ) +elseif(CUSOLVER_LIBRARY_DIRS) + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES cusolver + LIBRARY_PATHS ${CUSOLVER_LIBRARY_DIRS} + HEADER cusolverDn.h + ) +elseif(CUSOLVER_ROOT OR KokkosKernels_CUSOLVER_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(CUSOLVER INTERFACE + LIBRARIES cusolver + HEADER cusolverDn.h + ) +else() # backwards-compatible way + FIND_PACKAGE(CUDA) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT CUDA_FOUND) + #Important note here: this find Module is 
named TPLCUSOLVER + #The eventual target is named CUSOLVER. To avoid naming conflicts + #the find module is called TPLCUSOLVER. This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_FOUND) + ELSE() + #The libraries might be empty - OR they might explicitly be not found + IF("${CUDA_cusolver_LIBRARY}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSOLVER REQUIRED_VARS CUDA_cusolver_LIBRARY) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSOLVER INTERFACE LINK_LIBRARIES "${CUDA_cusolver_LIBRARY}") + ENDIF() + ENDIF() +endif() diff --git a/packages/kokkos-kernels/cmake/Modules/FindTPLCUSPARSE.cmake b/packages/kokkos-kernels/cmake/Modules/FindTPLCUSPARSE.cmake index f6e02129ae4a..6302f85d783a 100644 --- a/packages/kokkos-kernels/cmake/Modules/FindTPLCUSPARSE.cmake +++ b/packages/kokkos-kernels/cmake/Modules/FindTPLCUSPARSE.cmake @@ -1,17 +1,46 @@ -FIND_PACKAGE(CUDA) - -INCLUDE(FindPackageHandleStandardArgs) -IF (NOT CUDA_FOUND) - #Important note here: this find Module is named TPLCUSPARSE - #The eventual target is named CUSPARSE. To avoid naming conflicts - #the find module is called TPLCUSPARSE. 
This call will cause - #the find_package call to fail in a "standard" CMake way - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSPARSE REQUIRED_VARS CUDA_FOUND) -ELSE() - #The libraries might be empty - OR they might explicitly be not found - IF("${CUDA_cusparse_LIBRARY}" MATCHES "NOTFOUND") - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSPARSE REQUIRED_VARS CUDA_cusparse_LIBRARY) +if(CUSPARSE_LIBRARIES AND CUSPARSE_LIBRARY_DIRS AND CUSPARSE_INCLUDE_DIRS) + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES ${CUSPARSE_LIBRARIES} + LIBRARY_PATHS ${CUSPARSE_LIBRARY_DIRS} + HEADER_PATHS ${CUSPARSE_INCLUDE_DIRS} + ) +elseif(CUSPARSE_LIBRARIES AND CUSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES ${CUSPARSE_LIBRARIES} + LIBRARY_PATHS ${CUSPARSE_LIBRARY_DIRS} + HEADER cusparse.h + ) +elseif(CUSPARSE_LIBRARIES) + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES ${CUSPARSE_LIBRARIES} + HEADER cusparse.h + ) +elseif(CUSPARSE_LIBRARY_DIRS) + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES cusparse + LIBRARY_PATHS ${CUSPARSE_LIBRARY_DIRS} + HEADER cusparse.h + ) +elseif(CUSPARSE_ROOT OR KokkosKernels_CUSPARSE_ROOT) # nothing specific provided, just ROOT + kokkoskernels_find_imported(CUSPARSE INTERFACE + LIBRARIES cusparse + HEADER cusparse.h + ) +else() # backwards-compatible way + FIND_PACKAGE(CUDA) + INCLUDE(FindPackageHandleStandardArgs) + IF (NOT CUDA_FOUND) + #Important note here: this find Module is named TPLCUSPARSE + #The eventual target is named CUSPARSE. To avoid naming conflicts + #the find module is called TPLCUSPARSE. 
This call will cause + #the find_package call to fail in a "standard" CMake way + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSPARSE REQUIRED_VARS CUDA_FOUND) ELSE() - KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSPARSE LIBRARY ${CUDA_cusparse_LIBRARY}) + #The libraries might be empty - OR they might explicitly be not found + IF("${CUDA_cusparse_LIBRARY}" MATCHES "NOTFOUND") + FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUSPARSE REQUIRED_VARS CUDA_cusparse_LIBRARY) + ELSE() + KOKKOSKERNELS_CREATE_IMPORTED_TPL(CUSPARSE INTERFACE LINK_LIBRARIES "${CUDA_cusparse_LIBRARY}") + ENDIF() ENDIF() -ENDIF() +endif() diff --git a/packages/kokkos-kernels/cmake/Modules/FindTPLROCSOLVER.cmake b/packages/kokkos-kernels/cmake/Modules/FindTPLROCSOLVER.cmake new file mode 100644 index 000000000000..8f2a92cfdae2 --- /dev/null +++ b/packages/kokkos-kernels/cmake/Modules/FindTPLROCSOLVER.cmake @@ -0,0 +1,9 @@ +# LBV: 11/08/2023: This file follows the partern of FindTPLROCBLAS.cmake/FindTPLROCSPARSE.cmake +FIND_PACKAGE(ROCSOLVER) +if(TARGET roc::rocsolver) + SET(TPL_ROCSOLVER_IMPORTED_NAME roc::rocsolver) + SET(TPL_IMPORTED_NAME roc::rocsolver) + ADD_LIBRARY(KokkosKernels::ROCSOLVER ALIAS roc::rocsolver) +ELSE() + MESSAGE(FATAL_ERROR "Package ROCSOLVER requested but not found") +ENDIF() diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_components.cmake b/packages/kokkos-kernels/cmake/kokkoskernels_components.cmake index 49bc2f4ae608..16a784bd1ffe 100644 --- a/packages/kokkos-kernels/cmake/kokkoskernels_components.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_components.cmake @@ -102,4 +102,4 @@ IF ( KokkosKernels_ENABLE_COMPONENT_BATCHED ELSE() SET(KOKKOSKERNELS_ALL_COMPONENTS_ENABLED OFF CACHE BOOL "" FORCE) ENDIF() -mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) +mark_as_advanced(FORCE KOKKOSKERNELS_ALL_COMPONENTS_ENABLED) \ No newline at end of file diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_eti_devices.cmake 
b/packages/kokkos-kernels/cmake/kokkoskernels_eti_devices.cmake index 8c6cb540ae3a..8c38be098c96 100644 --- a/packages/kokkos-kernels/cmake/kokkoskernels_eti_devices.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_eti_devices.cmake @@ -23,20 +23,20 @@ SET(MEM_SPACES MEMSPACE_CUDASPACE MEMSPACE_CUDAUVMSPACE MEMSPACE_HIPSPACE + MEMSPACE_HIPMANAGEDSPACE MEMSPACE_SYCLSPACE MEMSPACE_SYCLSHAREDSPACE MEMSPACE_OPENMPTARGET MEMSPACE_HOSTSPACE - MEMSPACE_HBWSPACE ) SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::HIPSpace) +SET(MEMSPACE_HIPMANAGEDSPACE_CPP_TYPE Kokkos::HIPManagedSpace) SET(MEMSPACE_SYCLSPACE_CPP_TYPE Kokkos::Experimental::SYCLDeviceUSMSpace) SET(MEMSPACE_SYCLSHAREDSPACE_CPP_TYPE Kokkos::Experimental::SYCLSharedUSMSpace) SET(MEMSPACE_OPENMPTARGETSPACE_CPP_TYPE Kokkos::Experimental::OpenMPTargetSpace) SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) -SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) IF(KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_ADD_OPTION( @@ -85,10 +85,19 @@ IF(KOKKOS_ENABLE_HIP) BOOL "Whether to pre instantiate kernels for the memory space Kokkos::HIPSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: ON if Kokkos is HIP-enabled, OFF otherwise." ) + KOKKOSKERNELS_ADD_OPTION( + INST_MEMSPACE_HIPMANAGEDSPACE + OFF + BOOL + "Whether to pre instantiate kernels for the memory space Kokkos::HIPManagedSpace. Disabling this when Kokkos_ENABLE_HIP is enabled may increase build times. Default: OFF." 
+ ) IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) LIST(APPEND DEVICE_LIST "") ENDIF() + IF(KOKKOSKERNELS_INST_EXECSPACE_HIP AND KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() IF( Trilinos_ENABLE_COMPLEX_DOUBLE AND ((NOT DEFINED CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS) OR (NOT CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS)) ) MESSAGE( WARNING "The CMake option CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS is either undefined or OFF. Please set CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS:BOOL=ON when building with HIP and complex double enabled.") @@ -152,13 +161,6 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to pre instantiate kernels for the memory space Kokkos::HostSpace. Disabling this when one of the Host execution spaces is enabled may increase build times. Default: ON" ) -KOKKOSKERNELS_ADD_OPTION( - INST_MEMSPACE_HBWSPACE - OFF - BOOL - "Whether to pre instantiate kernels for the memory space Kokkos::HBWSpace." -) - KOKKOSKERNELS_ADD_OPTION( INST_EXECSPACE_OPENMP ${KOKKOSKERNELS_INST_EXECSPACE_OPENMP_DEFAULT} @@ -197,12 +199,12 @@ KOKKOSKERNELS_ADD_OPTION( ) SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) -SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE) +SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE HIPMANAGEDSPACE) SET(EXECSPACE_SYCL_VALID_MEM_SPACES SYCLSPACE SYCLSHAREDSPACE) SET(EXECSPACE_OPENMPTARGET_VALID_MEM_SPACES OPENMPTARGETSPACE) -SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) -SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) -SET(EXECSPACE_THREADS_VALID_MEM_SPACES HBWSPACE HOSTSPACE) +SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HOSTSPACE) +SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HOSTSPACE) +SET(EXECSPACE_THREADS_VALID_MEM_SPACES HOSTSPACE) SET(DEVICES) FOREACH(EXEC ${EXEC_SPACES}) IF (KOKKOSKERNELS_INST_${EXEC}) diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_features.cmake b/packages/kokkos-kernels/cmake/kokkoskernels_features.cmake index aacc1c845109..211c0c740e0a 100644 
--- a/packages/kokkos-kernels/cmake/kokkoskernels_features.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_features.cmake @@ -27,3 +27,38 @@ IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNE INCLUDE(CheckHostBlasReturnComplex.cmake) CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) ENDIF() + +# ================================================================== +# Lapack requirements +# ================================================================== + +IF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + MESSAGE(FATAL_ERROR "rocSOLVER requires rocBLAS and rocSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_ROCBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE:BOOL=ON.") +ELSEIF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) + MESSAGE(FATAL_ERROR "rocSOLVER requires rocSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE:BOOL=ON.") +ELSEIF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_ROCBLAS) + MESSAGE(FATAL_ERROR "rocSOLVER requires rocBLAS, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_ROCBLAS:BOOL=ON.") +ENDIF() + +# TPL_ENABLE_CUDA default enables CUBLAS and CUSOLVER in Trilinos, but not CUSPARSE. CUSPARSE is a required TPL for CUSOLVER support in KokkosKernels. +IF (KOKKOSKERNELS_HAS_TRILINOS AND TPL_ENABLE_CUDA) + # Checks disable CUSOLVER in KokkosKernels if TPL dependency requirements are not met. This is a compatibility workaround to allow existing configuration options for Trilinos to continue working. + IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(WARNING "cuSOLVER requires cuBLAS and cuSPARSE, disabling cuSOLVER. 
To use cuSOLVER, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON to use.") + SET(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER OFF CACHE BOOL "Disabling KOKKOSKERNELS_ENABLE_TPL_CUSOLVER - this capability requires both CUBLAS and CUSPARSE TPLs" FORCE) + ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(WARNING "cuSOLVER requires cuSPARSE, disabling cuSOLVER. To use cuSOLVER, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON to use.") + SET(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER OFF CACHE BOOL "Disabling KOKKOSKERNELS_ENABLE_TPL_CUSOLVER - this capability requires both CUBLAS and CUSPARSE TPLs" FORCE) + ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS) + MESSAGE(WARNING "cuSOLVER requires cuBLAS, disabling cuSOLVER. To use cuSOLVER, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON to use.") + SET(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER OFF CACHE BOOL "Disabling KOKKOSKERNELS_ENABLE_TPL_CUSOLVER - this capability requires both CUBLAS and CUSPARSE TPLs" FORCE) + ENDIF() +ELSE() + IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(FATAL_ERROR "cuSOLVER requires cuBLAS and cuSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON and KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON.") + ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + MESSAGE(FATAL_ERROR "cuSOLVER requires cuSPARSE, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUSPARSE:BOOL=ON.") + ELSEIF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER AND NOT KOKKOSKERNELS_ENABLE_TPL_CUBLAS) + MESSAGE(FATAL_ERROR "cuSOLVER requires cuBLAS, please reconfigure with KOKKOSKERNELS_ENABLE_TPL_CUBLAS:BOOL=ON.") + ENDIF() +ENDIF() diff --git a/packages/kokkos-kernels/cmake/kokkoskernels_tpls.cmake b/packages/kokkos-kernels/cmake/kokkoskernels_tpls.cmake index 
08c71581482f..d1a44721e6d7 100644 --- a/packages/kokkos-kernels/cmake/kokkoskernels_tpls.cmake +++ b/packages/kokkos-kernels/cmake/kokkoskernels_tpls.cmake @@ -447,28 +447,35 @@ ENDIF() KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_CUDA_TPLS OFF BOOL "Whether CUDA TPLs should be enabled by default. Default: OFF") SET(CUBLAS_DEFAULT ${KOKKOS_ENABLE_CUDA}) SET(CUSPARSE_DEFAULT ${KOKKOS_ENABLE_CUDA}) +SET(CUSOLVER_DEFAULT ${KOKKOS_ENABLE_CUDA}) IF(KOKKOSKERNELS_NO_DEFAULT_CUDA_TPLS) SET(CUBLAS_DEFAULT OFF) SET(CUSPARSE_DEFAULT OFF) + SET(CUSOLVER_DEFAULT OFF) ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(CUBLAS ${CUBLAS_DEFAULT} "Whether to enable CUBLAS" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_TPL_OPTION(CUSPARSE ${CUSPARSE_DEFAULT} "Whether to enable CUSPARSE" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") +KOKKOSKERNELS_ADD_TPL_OPTION(CUSOLVER ${CUSOLVER_DEFAULT} "Whether to enable CUSOLVER" + DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_ROCM_TPLS OFF BOOL "Whether ROCM TPLs should be enabled by default. Default: OFF") # Unlike CUDA, ROCm does not automatically install these TPLs SET(ROCBLAS_DEFAULT OFF) SET(ROCSPARSE_DEFAULT OFF) +SET(ROCSOLVER_DEFAULT OFF) # Since the default is OFF we do not really need this piece of logic here. 
# IF(KOKKOSKERNELS_NO_DEFAULT_ROCM_TPLS) # SET(ROCBLAS_DEFAULT OFF) # SET(ROCSPARSE_DEFAULT OFF) # ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(ROCBLAS ${ROCBLAS_DEFAULT} "Whether to enable ROCBLAS" - DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") + DEFAULT_DOCSTRING "OFF even if HIP-enabled Kokkos") KOKKOSKERNELS_ADD_TPL_OPTION(ROCSPARSE ${ROCSPARSE_DEFAULT} "Whether to enable ROCSPARSE" - DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") + DEFAULT_DOCSTRING "OFF even if HIP-enabled Kokkos") +KOKKOSKERNELS_ADD_TPL_OPTION(ROCSOLVER ${ROCSOLVER_DEFAULT} "Whether to enable ROCSOLVER" + DEFAULT_DOCSTRING "OFF even if HIP-enabled Kokkos") IF (KOKKOSKERNELS_ENABLE_TPL_MAGMA) IF (F77_BLAS_MANGLE STREQUAL "(name,NAME) name ## _") @@ -498,6 +505,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(MKL) KOKKOSKERNELS_IMPORT_TPL(CUBLAS) KOKKOSKERNELS_IMPORT_TPL(CUSPARSE) + KOKKOSKERNELS_IMPORT_TPL(CUSOLVER) KOKKOSKERNELS_IMPORT_TPL(CBLAS) KOKKOSKERNELS_IMPORT_TPL(LAPACKE) KOKKOSKERNELS_IMPORT_TPL(CHOLMOD) @@ -507,6 +515,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(MAGMA) KOKKOSKERNELS_IMPORT_TPL(ROCBLAS) KOKKOSKERNELS_IMPORT_TPL(ROCSPARSE) + KOKKOSKERNELS_IMPORT_TPL(ROCSOLVER) ELSE () IF (Trilinos_ENABLE_SuperLU5_API) SET(HAVE_KOKKOSKERNELS_SUPERLU5_API TRUE) diff --git a/packages/kokkos-kernels/common/impl/KokkosKernels_ViewUtils.hpp b/packages/kokkos-kernels/common/impl/KokkosKernels_ViewUtils.hpp index ac4abb6457fe..2ae8fb609d72 100644 --- a/packages/kokkos-kernels/common/impl/KokkosKernels_ViewUtils.hpp +++ b/packages/kokkos-kernels/common/impl/KokkosKernels_ViewUtils.hpp @@ -19,11 +19,6 @@ #include "Kokkos_Core.hpp" namespace KokkosKernels::Impl { -// lbv - 07/26/2023: -// MemoryTraits::impl_value was added -// in Kokkos 4.1.00 so we should guard -// the content of this header until v4.3.0 -#if KOKKOS_VERSION >= 40100 || defined(DOXY) /*! 
\brief Yields a type that is View with Kokkos::Unmanaged added to the memory * traits @@ -59,7 +54,6 @@ auto make_unmanaged(const View &v) { return typename with_unmanaged::type(v); } -#endif // KOKKOS_VERSION >= 40100 } // namespace KokkosKernels::Impl #endif diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_ExecSpaceUtils.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_ExecSpaceUtils.hpp index 2ec09f4069e4..4d3a3002b45b 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -215,10 +215,21 @@ inline void kk_get_free_total_memory(size_t& free_mem, total_mem /= n_streams; } template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem, + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); +} +template <> inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { kk_get_free_total_memory(free_mem, total_mem, 1); } +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); +} #endif // FIXME_SYCL Use compiler extension instead of low level interface when diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_PrintConfiguration.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_PrintConfiguration.hpp index cd2333b3ec67..c2e3a5187f3d 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_PrintConfiguration.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_PrintConfiguration.hpp @@ -44,6 +44,18 @@ inline void print_cusparse_version_if_enabled(std::ostream& os) { << "KOKKOSKERNELS_ENABLE_TPL_CUSPARSE: no\n"; #endif } + +inline void print_cusolver_version_if_enabled(std::ostream& os) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: " << cusolver_version_string() + << "\n"; +#else + os << " " + << 
"KOKKOSKERNELS_ENABLE_TPL_CUSOLVER: no\n"; +#endif +} + inline void print_enabled_tpls(std::ostream& os) { #ifdef KOKKOSKERNELS_ENABLE_TPL_LAPACK os << " " @@ -96,6 +108,7 @@ inline void print_enabled_tpls(std::ostream& os) { #endif print_cublas_version_if_enabled(os); print_cusparse_version_if_enabled(os); + print_cusolver_version_if_enabled(os); #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCBLAS: yes\n"; @@ -110,6 +123,13 @@ inline void print_enabled_tpls(std::ostream& os) { os << " " << "KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE: no\n"; #endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER: yes\n"; +#else + os << " " + << "KOKKOSKERNELS_ENABLE_TPL_ROCOLVER: no\n"; +#endif #ifdef KOKKOSKERNELS_ENABLE_TPL_METIS os << "KOKKOSKERNELS_ENABLE_TPL_METIS: yes\n"; #else diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_TplsVersion.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_TplsVersion.hpp index 38de7c13991c..3e00d72457a8 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_TplsVersion.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_TplsVersion.hpp @@ -28,6 +28,10 @@ #include "cusparse.h" #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) +#include "cusolver_common.h" +#endif + namespace KokkosKernels { #if defined(KOKKOSKERNELS_ENABLE_TPL_CUBLAS) @@ -53,5 +57,16 @@ inline std::string cusparse_version_string() { } #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) +inline std::string cusolver_version_string() { + std::stringstream ss; + + ss << CUSOLVER_VER_MAJOR << "." << CUSOLVER_VER_MINOR << "." + << CUSOLVER_VER_PATCH << "." 
<< CUSOLVER_VER_BUILD; + + return ss.str(); +} +#endif + } // namespace KokkosKernels #endif // _KOKKOSKERNELS_TPLS_VERSIONS_HPP diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp index e1c15505ffdb..ba8049cecfa4 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_Utils.hpp @@ -890,7 +890,7 @@ void permute_block_vector(typename idx_array_type::value_type num_elements, // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. -template +template void zero_vector(ExecSpaceIn &exec_space_in, typename value_array_type::value_type /* num_elements */, value_array_type &vector) { @@ -906,8 +906,7 @@ void zero_vector(typename value_array_type::value_type /* num_elements */, using ne_tmp_t = typename value_array_type::value_type; ne_tmp_t ne_tmp = ne_tmp_t(0); MyExecSpace my_exec_space; - zero_vector(my_exec_space, ne_tmp, - vector); + zero_vector(my_exec_space, ne_tmp, vector); } template diff --git a/packages/kokkos-kernels/common/src/KokkosKernels_helpers.hpp b/packages/kokkos-kernels/common/src/KokkosKernels_helpers.hpp index b36360b991ac..1b725f2f5c6a 100644 --- a/packages/kokkos-kernels/common/src/KokkosKernels_helpers.hpp +++ b/packages/kokkos-kernels/common/src/KokkosKernels_helpers.hpp @@ -29,11 +29,11 @@ namespace Impl { // Used to reduce number of code instantiations. 
template struct GetUnifiedLayoutPreferring { - typedef typename std::conditional< - ((ViewType::rank == 1) && (!std::is_same::value)) || - ((ViewType::rank == 0)), - PreferredLayoutType, typename ViewType::array_layout>::type array_layout; + using array_layout = typename std::conditional< + ((ViewType::rank == 1) && !std::is_same_v) || + (ViewType::rank == 0), + PreferredLayoutType, typename ViewType::array_layout>::type; }; template diff --git a/packages/kokkos-kernels/common/src/KokkosLinAlg_config.h b/packages/kokkos-kernels/common/src/KokkosLinAlg_config.h index fccfe799ca62..fe97c1de8bec 100644 --- a/packages/kokkos-kernels/common/src/KokkosLinAlg_config.h +++ b/packages/kokkos-kernels/common/src/KokkosLinAlg_config.h @@ -18,6 +18,8 @@ #ifndef KOKKOSLINALG_CONFIG_H #define KOKKOSLINALG_CONFIG_H +[[deprecated("KokkosLinAlg_config.h is deprecated!")]] + #include #endif // KOKKOSLINALG_CONFIG_H diff --git a/packages/kokkos-kernels/docs/developer/apidocs/sparse.rst b/packages/kokkos-kernels/docs/developer/apidocs/sparse.rst index 415f72eec826..3a55e50c8b22 100644 --- a/packages/kokkos-kernels/docs/developer/apidocs/sparse.rst +++ b/packages/kokkos-kernels/docs/developer/apidocs/sparse.rst @@ -94,3 +94,16 @@ par_ilut gmres ----- .. doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner* precond) + +sptrsv +------ +.. doxygenfunction:: sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) +.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) +.. doxygenfunction:: sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) +.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) +.. 
doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handleL, KernelHandle *handleU, XType x, XType b) +.. doxygenfunction:: sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, XType b) diff --git a/packages/kokkos-kernels/docs/requirements.txt b/packages/kokkos-kernels/docs/requirements.txt index 188f51e62dee..75f092707b27 100644 --- a/packages/kokkos-kernels/docs/requirements.txt +++ b/packages/kokkos-kernels/docs/requirements.txt @@ -1 +1,2 @@ -breathe \ No newline at end of file +breathe +sphinx-rtd-theme \ No newline at end of file diff --git a/packages/kokkos-kernels/example/wiki/CMakeLists.txt b/packages/kokkos-kernels/example/wiki/CMakeLists.txt index 11c6e0d97df5..1e751f57979d 100644 --- a/packages/kokkos-kernels/example/wiki/CMakeLists.txt +++ b/packages/kokkos-kernels/example/wiki/CMakeLists.txt @@ -1,2 +1,3 @@ +ADD_SUBDIRECTORY(blas) ADD_SUBDIRECTORY(sparse) ADD_SUBDIRECTORY(graph) diff --git a/packages/kokkos-kernels/example/wiki/blas/CMakeLists.txt b/packages/kokkos-kernels/example/wiki/blas/CMakeLists.txt new file mode 100644 index 000000000000..245957bc893e --- /dev/null +++ b/packages/kokkos-kernels/example/wiki/blas/CMakeLists.txt @@ -0,0 +1,19 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../../../../test_common) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_blas2_ger + SOURCES 
KokkosBlas2_wiki_ger.cpp + ) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_blas2_syr + SOURCES KokkosBlas2_wiki_syr.cpp + ) + +KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( + wiki_blas2_syr2 + SOURCES KokkosBlas2_wiki_syr2.cpp + ) diff --git a/packages/kokkos-kernels/example/wiki/blas/KokkosBlas2_wiki_ger.cpp b/packages/kokkos-kernels/example/wiki/blas/KokkosBlas2_wiki_ger.cpp new file mode 100644 index 000000000000..89eaaf929230 --- /dev/null +++ b/packages/kokkos-kernels/example/wiki/blas/KokkosBlas2_wiki_ger.cpp @@ -0,0 +1,23 @@ +#include +#include + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + constexpr int M = 5; + constexpr int N = 4; + + Kokkos::View A("A", M, N); + Kokkos::View x("X", M); + Kokkos::View y("Y", N); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(x, 3.0); + Kokkos::deep_copy(y, 1.3); + + const double alpha = Kokkos::ArithTraits::one(); + + KokkosBlas::ger("T", alpha, x, y, A); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos-kernels/example/wiki/blas/KokkosBlas2_wiki_syr.cpp b/packages/kokkos-kernels/example/wiki/blas/KokkosBlas2_wiki_syr.cpp new file mode 100644 index 000000000000..26c6a489b896 --- /dev/null +++ b/packages/kokkos-kernels/example/wiki/blas/KokkosBlas2_wiki_syr.cpp @@ -0,0 +1,20 @@ +#include +#include + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + constexpr int M = 5; + + Kokkos::View A("A", M, M); + Kokkos::View x("X", M); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(x, 3.0); + + const double alpha = double(1.0); + + KokkosBlas::syr("T", "U", alpha, x, A); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos-kernels/example/wiki/blas/KokkosBlas2_wiki_syr2.cpp b/packages/kokkos-kernels/example/wiki/blas/KokkosBlas2_wiki_syr2.cpp new file mode 100644 index 000000000000..c1c8e5d0d12f --- /dev/null +++ b/packages/kokkos-kernels/example/wiki/blas/KokkosBlas2_wiki_syr2.cpp @@ -0,0 +1,22 @@ +#include +#include + +int main(int argc, char* argv[]) { + 
Kokkos::initialize(argc, argv); + { + constexpr int M = 5; + + Kokkos::View A("A", M, M); + Kokkos::View x("X", M); + Kokkos::View y("Y", M); + + Kokkos::deep_copy(A, 1.0); + Kokkos::deep_copy(x, 3.0); + Kokkos::deep_copy(y, 1.3); + + const double alpha = double(1.0); + + KokkosBlas::syr2("T", "U", alpha, x, y, A); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos-kernels/graph/unit_test/CMakeLists.txt b/packages/kokkos-kernels/graph/unit_test/CMakeLists.txt index 63539d977681..b49795315915 100644 --- a/packages/kokkos-kernels/graph/unit_test/CMakeLists.txt +++ b/packages/kokkos-kernels/graph/unit_test/CMakeLists.txt @@ -10,6 +10,12 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_C # # ##################### +IF (KokkosKernels_TEST_ETI_ONLY) + IF (NOT KokkosKernels_INST_DOUBLE AND NOT KokkosKernels_INST_FLOAT) + MESSAGE(FATAL_ERROR "Because only ETI'd type combinations are enabled for testing, the Kokkos Kernels graph tests require that double or float is enabled in ETI.") + ENDIF () +ENDIF () + ##################### # # # Add GPU backends # diff --git a/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color.hpp b/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color.hpp index 5d4eec03ca2c..101c489bc059 100644 --- a/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color.hpp +++ b/packages/kokkos-kernels/graph/unit_test/Test_Graph_graph_color.hpp @@ -110,10 +110,15 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, COLORING_DEFAULT, COLORING_SERIAL, COLORING_VB, COLORING_VBBIT, COLORING_VBCS}; -#ifdef KOKKOS_ENABLE_CUDA + // FIXME: VBD sometimes fails on CUDA and HIP +#if defined(KOKKOS_ENABLE_CUDA) if (!std::is_same::value) { coloring_algorithms.push_back(COLORING_VBD); } +#elif defined(KOKKOS_ENABLE_HIP) + if (!std::is_same::value) { + coloring_algorithms.push_back(COLORING_VBD); + } #else coloring_algorithms.push_back(COLORING_VBD); #endif @@ -174,9 +179,15 @@ void 
test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, } } } - EXPECT_TRUE((num_conflict == conf)); - - EXPECT_TRUE((num_conflict == 0)); + EXPECT_TRUE((num_conflict == conf)) + << "Coloring algo " << (int)coloring_algorithm + << ": kk_is_d1_coloring_valid returned incorrect number of conflicts (" + << num_conflict << ", should be " << conf << ")"; + + EXPECT_TRUE((num_conflict == 0)) + << "Coloring algo " << (int)coloring_algorithm + << ": D1 coloring produced invalid coloring (" << num_conflict + << " conflicts)"; } // device::execution_space::finalize(); } diff --git a/packages/kokkos-kernels/lapack/CMakeLists.txt b/packages/kokkos-kernels/lapack/CMakeLists.txt index 7c0c3183bded..f825a2184a10 100644 --- a/packages/kokkos-kernels/lapack/CMakeLists.txt +++ b/packages/kokkos-kernels/lapack/CMakeLists.txt @@ -28,19 +28,12 @@ IF (KOKKOSKERNELS_ENABLE_TPL_LAPACK OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKER ENDIF() # Include cuda lapack TPL source file -IF (KOKKOSKERNELS_ENABLE_TPL_MAGMA) +IF (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) LIST(APPEND SOURCES lapack/tpls/KokkosLapack_Cuda_tpl.cpp ) ENDIF() -# Include rocm lapack TPL source file -IF (KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER) - LIST(APPEND SOURCES - lapack/tpls/KokkosLapack_Rocm_tpl.cpp - ) -ENDIF() - ################## # # # ETI generation # @@ -65,3 +58,10 @@ KOKKOSKERNELS_GENERATE_ETI(Lapack_trtri trtri SOURCE_LIST SOURCES TYPE_LISTS FLOATS LAYOUTS DEVICES ) + +KOKKOSKERNELS_GENERATE_ETI(Lapack_svd svd + COMPONENTS lapack + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS LAYOUTS DEVICES +) diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp b/packages/kokkos-kernels/lapack/eti/generated_specializations_cpp/svd/KokkosLapack_svd_eti_spec_inst.cpp.in similarity index 72% rename from packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp rename to 
packages/kokkos-kernels/lapack/eti/generated_specializations_cpp/svd/KokkosLapack_svd_eti_spec_inst.cpp.in index 348b9feeab04..62dd75475f00 100644 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp +++ b/packages/kokkos-kernels/lapack/eti/generated_specializations_cpp/svd/KokkosLapack_svd_eti_spec_inst.cpp.in @@ -14,5 +14,13 @@ // //@HEADER -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -#include + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosLapack_svd_spec.hpp" + +namespace KokkosLapack { +namespace Impl { +@LAPACK_SVD_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp b/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_avail.hpp.in similarity index 75% rename from packages/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp rename to packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_avail.hpp.in index 1328c9313524..49e526b7e8ad 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_HBWSpace.hpp +++ b/packages/kokkos-kernels/lapack/eti/generated_specializations_hpp/KokkosLapack_svd_eti_spec_avail.hpp.in @@ -14,11 +14,11 @@ // //@HEADER -#ifndef KOKKOS_DECLARE_HBWSPACE_HPP -#define KOKKOS_DECLARE_HBWSPACE_HPP - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - +#ifndef KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL_HPP_ +namespace KokkosLapack { +namespace Impl { +@LAPACK_SVD_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos #endif diff --git a/packages/kokkos-kernels/lapack/impl/KokkosLapack_gesv_spec.hpp b/packages/kokkos-kernels/lapack/impl/KokkosLapack_gesv_spec.hpp index b9f854931182..97d74280ffc4 100644 --- a/packages/kokkos-kernels/lapack/impl/KokkosLapack_gesv_spec.hpp +++ b/packages/kokkos-kernels/lapack/impl/KokkosLapack_gesv_spec.hpp @@ -28,7 +28,7 @@ namespace KokkosLapack { namespace Impl { // Specialization 
struct which defines whether a specialization exists -template +template struct gesv_eti_spec_avail { enum : bool { value = false }; }; @@ -46,12 +46,16 @@ struct gesv_eti_spec_avail { EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template <> \ struct gesv_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits > > { \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ enum : bool { value = true }; \ }; @@ -65,24 +69,28 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosLapack::gesv. -template ::value, - bool eti_spec_avail = gesv_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = + gesv_eti_spec_avail::value> struct GESV { - static void gesv(const AMatrix &A, const BXMV &B, const IPIVV &IPIV); + static void gesv(const ExecutionSpace &space, const AMatrix &A, const BXMV &B, + const IPIVV &IPIV); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of gesv for multi vectors. // Unification layer -template -struct GESV { - static void gesv(const AMatrix & /* A */, const BXMV & /* B */, - const IPIVV & /* IPIV */) { +template +struct GESV { + static void gesv(const ExecutionSpace & /* space */, const AMatrix & /* A */, + const BXMV & /* B */, const IPIVV & /* IPIV */) { // NOTE: Might add the implementation of KokkosLapack::gesv later throw std::runtime_error( "No fallback implementation of GESV (general LU factorization & solve) " - "exists. Enable LAPACK and/or MAGMA TPL."); + "exists. 
Enable LAPACK, CUSOLVER, ROCSOLVER or MAGMA TPL."); } }; @@ -100,31 +108,33 @@ struct GESV { #define KOKKOSLAPACK_GESV_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ extern template struct GESV< \ + EXEC_SPACE_TYPE, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ false, true>; #define KOKKOSLAPACK_GESV_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ template struct GESV< \ + EXEC_SPACE_TYPE, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::MemoryTraits>, \ false, true>; #include diff --git a/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_impl.hpp b/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_impl.hpp new file mode 100644 index 000000000000..49df758936b6 --- /dev/null +++ b/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_impl.hpp @@ -0,0 +1,34 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_IMPL_SVD_HPP_ +#define KOKKOSLAPACK_IMPL_SVD_HPP_ + +/// \file KokkosLapack_svd_impl.hpp +/// \brief Implementation(s) of singular value decomposition of a dense matrix. 
+ +#include +#include + +namespace KokkosLapack { +namespace Impl { + +// NOTE: Might add the implementation of KokkosLapack::svd later + +} // namespace Impl +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_IMPL_SVD_HPP diff --git a/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_spec.hpp b/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_spec.hpp new file mode 100644 index 000000000000..fc0a34f790a3 --- /dev/null +++ b/packages/kokkos-kernels/lapack/impl/KokkosLapack_svd_spec.hpp @@ -0,0 +1,156 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#ifndef KOKKOSLAPACK_IMPL_SVD_SPEC_HPP_ +#define KOKKOSLAPACK_IMPL_SVD_SPEC_HPP_ + +#include +#include +#include + +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosLapack { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct svd_eti_spec_avail { + enum : bool { value = false }; +}; +} // namespace Impl +} // namespace KokkosLapack + +// +// Macro for declaration of full specialization availability +// KokkosLapack::Impl::SVD. This is NOT for users!!! All +// the declarations of full specializations go in this header file. +// We may spread out definitions (see _INST macro below) across one or +// more .cpp files. 
+// +#define KOKKOSLAPACK_SVD_ETI_SPEC_AVAIL(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct svd_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosLapack { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosLapack::svd. + +template ::value, + bool eti_spec_avail = svd_eti_spec_avail< + ExecutionSpace, AMatrix, SVector, UMatrix, VMatrix>::value> +struct SVD { + static void svd(const ExecutionSpace &space, const char jobu[], + const char jobvt[], const AMatrix &A, const SVector &S, + const UMatrix &U, const VMatrix &Vt); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +//! Full specialization of svd +// Unification layer +template +struct SVD { + static void svd(const ExecutionSpace & /* space */, const char * /* jobu */, + const char * /* jobvt */, const AMatrix & /* A */, + const SVector & /* S */, const UMatrix & /* U */, + const VMatrix & /* Vt */) { + // NOTE: Might add the implementation of KokkosLapack::svd later + throw std::runtime_error( + "No fallback implementation of SVD (singular value decomposition) " + "exists. Enable LAPACK, CUSOLVER or ROCSOLVER TPL to use this " + "function."); + } +}; + +#endif +} // namespace Impl +} // namespace KokkosLapack + +// +// Macro for declaration of full specialization of +// KokkosLapack::Impl::SVD. This is NOT for users!!! All +// the declarations of full specializations go in this header file. +// We may spread out definitions (see _DEF macro below) across one or +// more .cpp files. 
+// +#define KOKKOSLAPACK_SVD_ETI_SPEC_DECL(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct SVD< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#define KOKKOSLAPACK_SVD_ETI_SPEC_INST(SCALAR_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct SVD< \ + EXEC_SPACE_TYPE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#include + +#endif // KOKKOSLAPACK_IMPL_SVD_SPEC_HPP_ diff --git a/packages/kokkos-kernels/lapack/src/KokkosLapack_gesv.hpp b/packages/kokkos-kernels/lapack/src/KokkosLapack_gesv.hpp index 4c9058f8abe1..b66583bbdf2b 100644 --- a/packages/kokkos-kernels/lapack/src/KokkosLapack_gesv.hpp +++ b/packages/kokkos-kernels/lapack/src/KokkosLapack_gesv.hpp @@ -34,28 +34,50 @@ namespace KokkosLapack { /// \brief Solve the dense linear equation system A*X = B. /// +/// \tparam ExecutionSpace the space where the kernel will run. /// \tparam AMatrix Input matrix/Output LU, as a 2-D Kokkos::View. /// \tparam BXMV Input (right-hand side)/Output (solution) (multi)vector, as a -/// 1-D or 2-D Kokkos::View. \tparam IPIVV Output pivot indices, as a 1-D -/// Kokkos::View +/// 1-D or 2-D Kokkos::View. +/// \tparam IPIVV Output pivot indices, as a 1-D Kokkos::View /// +/// \param space [in] execution space instance used to specified how to execute +/// the gesv kernels. /// \param A [in,out] On entry, the N-by-N matrix to be solved. On exit, the /// factors L and U from /// the factorization A = P*L*U; the unit diagonal elements of L are not /// stored. 
/// \param B [in,out] On entry, the right hand side (multi)vector B. On exit, -/// the solution (multi)vector X. \param IPIV [out] On exit, the pivot indices -/// (for partial pivoting). If the View extents are zero and -/// its data pointer is NULL, pivoting is not used. +/// the solution (multi)vector X. +/// \param IPIV [out] On exit, the pivot indices (for partial pivoting). +/// If the View extents are zero and its data pointer is NULL, pivoting is not +/// used. /// -template -void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { - // NOTE: Currently, KokkosLapack::gesv only supports for MAGMA TPL and LAPACK - // TPL. - // MAGMA TPL should be enabled to call the MAGMA GPU interface for - // device views LAPACK TPL should be enabled to call the LAPACK - // interface for host views +template +void gesv(const ExecutionSpace& space, const AMatrix& A, const BXMV& B, + const IPIVV& IPIV) { + // NOTE: Currently, KokkosLapack::gesv only supports LAPACK, MAGMA and + // rocSOLVER TPLs. 
+ // MAGMA/rocSOLVER TPL should be enabled to call the MAGMA/rocSOLVER GPU + // interface for device views LAPACK TPL should be enabled to call the + // LAPACK interface for host views + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert( + Kokkos::SpaceAccessibility::accessible); +#if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) + if constexpr (!std::is_same_v) { + static_assert( + Kokkos::SpaceAccessibility::accessible); + } +#else + static_assert( + Kokkos::SpaceAccessibility::accessible); +#endif static_assert(Kokkos::is_view::value, "KokkosLapack::gesv: A must be a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -137,15 +159,38 @@ void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { if (BXMV::rank == 1) { auto B_i = BXMV_Internal(B.data(), B.extent(0), 1); - KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, IPIV_i); } else { // BXMV::rank == 2 auto B_i = BXMV_Internal(B.data(), B.extent(0), B.extent(1)); - KokkosLapack::Impl::GESV::gesv(A_i, B_i, IPIV_i); + KokkosLapack::Impl::GESV::gesv(space, A_i, B_i, IPIV_i); } } +/// \brief Solve the dense linear equation system A*X = B. +/// +/// \tparam AMatrix Input matrix/Output LU, as a 2-D Kokkos::View. +/// \tparam BXMV Input (right-hand side)/Output (solution) (multi)vector, as a +/// 1-D or 2-D Kokkos::View. +/// \tparam IPIVV Output pivot indices, as a 1-D Kokkos::View +/// +/// \param A [in,out] On entry, the N-by-N matrix to be solved. On exit, the +/// factors L and U from +/// the factorization A = P*L*U; the unit diagonal elements of L are not +/// stored. +/// \param B [in,out] On entry, the right hand side (multi)vector B. On exit, +/// the solution (multi)vector X. +/// \param IPIV [out] On exit, the pivot indices (for partial pivoting). +/// If the View extents are zero and its data pointer is NULL, pivoting is not +/// used. 
+/// +template +void gesv(const AMatrix& A, const BXMV& B, const IPIVV& IPIV) { + typename AMatrix::execution_space space{}; + gesv(space, A, B, IPIV); +} + } // namespace KokkosLapack #endif // KOKKOSLAPACK_GESV_HPP_ diff --git a/packages/kokkos-kernels/lapack/src/KokkosLapack_svd.hpp b/packages/kokkos-kernels/lapack/src/KokkosLapack_svd.hpp new file mode 100644 index 000000000000..71ea7cc30f81 --- /dev/null +++ b/packages/kokkos-kernels/lapack/src/KokkosLapack_svd.hpp @@ -0,0 +1,246 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/// \file KokkosLapack_svd.hpp +/// \brief Singular Value Decomposition (SVD) +/// +/// This file provides KokkosLapack::svd. This function performs a +/// local (no MPI) singular value decomposition of the input matrix A +/// and returns the singular values and vectors dedending on input flags. + +#ifndef KOKKOSLAPACK_SVD_HPP_ +#define KOKKOSLAPACK_SVD_HPP_ + +#include + +#include "KokkosLapack_svd_spec.hpp" +#include "KokkosKernels_Error.hpp" + +namespace KokkosLapack { + +// clang-format off +/// \brief Compute the Singular Value Decomposition of A = U*S*Vt +/// +/// \tparam ExecutionSpace the space where the kernel will run. +/// \tparam AMatrix (mxn) matrix as a rank-2 Kokkos::View. 
+/// \tparam SVector min(m,n) vector as a rank-1 Kokkos::View +/// \tparam UMatrix (mxm) matrix as a rank-2 Kokkos::View +/// \tparam VMatrix (nxn) matrix as a rank-2 Kokkos::View +/// +/// \param space [in] execution space instance used to specified how to execute +/// the svd kernels. +/// \param jobu [in] flag to control the computation of the left singular +/// vectors when set to: 'A' all vectors are computed, 'S' the first min(m,n) +/// singular vectors are computed, 'O' the first min(m,n) singular vectors are +/// overwritten into A, 'N' no singular vectors are computed. +/// \param jobvt [in] flag to control the computation of the right singular +/// vectors when set to: 'A' all vectors are computed, 'S' the first min(m,n) +/// singular vectors are computed, 'O' the first min(m,n) singular vectors are +/// overwritten into A, 'N' no singular vectors are computed. +/// \param A [in] An m-by-n matrix to be decomposed using its singular values. +/// \param S [out] Vector of the min(m, n) singular values of A. +/// \param U [out] the first min(m, n) columns of U are the left singular +/// vectors of A. +/// \param Vt [out] the first min(m, n) columns of Vt are the right singular +/// vectors of A. 
+/// +// clang-format on +template +void svd(const ExecutionSpace& space, const char jobu[], const char jobvt[], + const AMatrix& A, const SVector& S, const UMatrix& U, + const VMatrix& Vt) { + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert( + Kokkos::SpaceAccessibility::accessible); + static_assert(Kokkos::is_view::value, + "KokkosLapack::svd: A must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::svd: S must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::svd: U must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosLapack::svd: Vt must be a Kokkos::View."); + static_assert(AMatrix::rank() == 2, "KokkosLapack::svd: A must have rank 2."); + static_assert(SVector::rank() == 1, "KokkosLapack::svd: S must have rank 1."); + static_assert(UMatrix::rank() == 2, "KokkosLapack::svd: U must have rank 2."); + static_assert(VMatrix::rank() == 2, + "KokkosLapack::svd: Vt must have rank 2."); + + int64_t m = A.extent(0); + int64_t n = A.extent(1); + int64_t rankA = Kokkos::min(m, n); + + // No work to do since the matrix is empty... + // Also do not send a matrix with size zero + // to Lapack TPLs or they will complain! 
+ if ((m == 0) || (n == 0)) { + return; + } + + // Check the jobu and jobvt control flags + // The only valid options there are 'A', 'S', 'O' and 'N' + const bool is_jobu_invalid = + !((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || + (jobu[0] == 's') || (jobu[0] == 'O') || (jobu[0] == 'o') || + (jobu[0] == 'N') || (jobu[0] == 'n')); + + const bool is_jobvt_invalid = + !((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || + (jobvt[0] == 's') || (jobvt[0] == 'O') || (jobvt[0] == 'o') || + (jobvt[0] == 'N') || (jobvt[0] == 'n')); + + if (is_jobu_invalid && is_jobvt_invalid) { + std::ostringstream oss; + oss << "KokkosLapack::svd: both jobu and jobvt are invalid!\n" + << "Possible values are A, S, O or N, submitted values are " << jobu[0] + << " and " << jobvt[0] << "\n"; + KokkosKernels::Impl::throw_runtime_exception(oss.str()); + } + if (is_jobu_invalid) { + std::ostringstream oss; + oss << "KokkosLapack::svd: jobu is invalid!\n" + << "Possible values are A, S, O or N, submitted value is " << jobu[0] + << "\n"; + KokkosKernels::Impl::throw_runtime_exception(oss.str()); + } + if (is_jobvt_invalid) { + std::ostringstream oss; + oss << "KokkosLapack::svd: jobvt is invalid!\n" + << "Possible values are A, S, O or N, submitted value is " << jobvt[0] + << "\n"; + KokkosKernels::Impl::throw_runtime_exception(oss.str()); + } + + if (((jobu[0] == 'O') || (jobu[0] == 'o')) && + ((jobvt[0] == 'O') || (jobvt[0] == 'o'))) { + std::ostringstream oss; + oss << "KokkosLapack::svd: jobu and jobvt cannot be O at the same time!\n"; + KokkosKernels::Impl::throw_runtime_exception(oss.str()); + } + + // Check validity of output views sizes + // Note that of jobu/jobvt are set to O or N + // then the associated matrix does not need storage + bool is_extent_invalid = false; + std::ostringstream os; + if (S.extent_int(0) != rankA) { + is_extent_invalid = true; + os << "KokkosLapack::svd: S has extent " << S.extent(0) << ", instead of " + << rankA << ".\n"; + } + if 
((jobu[0] == 'A') || (jobu[0] == 'a') || (jobu[0] == 'S') || + (jobu[0] == 's')) { + if (U.extent_int(0) != m || U.extent_int(1) != m) { + is_extent_invalid = true; + os << "KokkosLapack::svd: U has extents (" << U.extent(0) << ", " + << U.extent(1) << ") instead of (" << m << ", " << m << ").\n"; + } + } + if ((jobvt[0] == 'A') || (jobvt[0] == 'a') || (jobvt[0] == 'S') || + (jobvt[0] == 's')) { + if (Vt.extent_int(0) != n || Vt.extent_int(1) != n) { + is_extent_invalid = true; + os << "KokkosLapack::svd: V has extents (" << Vt.extent(0) << ", " + << Vt.extent(1) << ") instead of (" << n << ", " << n << ").\n"; + } + } + if (is_extent_invalid) { + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + if (std::is_same_v && + (A.extent(0) < A.extent(1))) { + throw std::runtime_error( + "CUSOLVER does not support SVD for matrices with more columns " + "than rows, you can transpose you matrix first then compute " + "SVD of that transpose: At=VSUt, and swap the output U and Vt" + " and transpose them to recover the desired SVD."); + } +#endif + + using AMatrix_Internal = Kokkos::View< + typename AMatrix::non_const_value_type**, typename AMatrix::array_layout, + typename AMatrix::device_type, Kokkos::MemoryTraits>; + + using SVector_Internal = Kokkos::View< + typename SVector::non_const_value_type*, typename SVector::array_layout, + typename SVector::device_type, Kokkos::MemoryTraits>; + + using UMatrix_Internal = Kokkos::View< + typename UMatrix::non_const_value_type**, typename UMatrix::array_layout, + typename UMatrix::device_type, Kokkos::MemoryTraits>; + + using VMatrix_Internal = Kokkos::View< + typename VMatrix::non_const_value_type**, typename VMatrix::array_layout, + typename VMatrix::device_type, Kokkos::MemoryTraits>; + + AMatrix_Internal A_i = A; + SVector_Internal S_i = S; + UMatrix_Internal U_i = U; + VMatrix_Internal Vt_i = Vt; + + KokkosLapack::Impl::SVD::svd(space, jobu, + jobvt, A_i, + S_i, U_i, 
+ Vt_i); +} + +// clang-format off +/// \brief Compute the Singular Value Decomposition of A = U*S*Vt +/// +/// \tparam AMatrix (mxn) matrix as a rank-2 Kokkos::View. +/// \tparam SVector min(m,n) vector as a rank-1 Kokkos::View +/// \tparam UMatrix (mxm) matrix as a rank-2 Kokkos::View +/// \tparam VMatrix (nxn) matrix as a rank-2 Kokkos::View +/// +/// \param jobu [in] flag to control the computation of the left singular +/// vectors when set to: 'A' all vectors are computed, 'S' the first min(m,n) +/// singular vectors are computed, 'O' the first min(m,n) singular vectors are +/// overwritten into A, 'N' no singular vectors are computed. +/// \param jobvt [in] flag to control the computation of the right singular +/// vectors when set to: 'A' all vectors are computed, 'S' the first min(m,n) +/// singular vectors are computed, 'O' the first min(m,n) singular vectors are +/// overwritten into A, 'N' no singular vectors are computed. +/// \param A [in] An m-by-n matrix to be decomposed using its singular values. +/// \param S [out] Vector of the min(m, n) singular values of A. +/// \param U [out] the first min(m, n) columns of U are the left singular +/// vectors of A. +/// \param Vt [out] the first min(m, n) columns of Vt are the right singular +/// vectors of A. 
+/// +// clang-format on +template +void svd(const char jobu[], const char jobvt[], const AMatrix& A, + const SVector& S, const UMatrix& U, const VMatrix& Vt) { + typename AMatrix::execution_space space{}; + svd(space, jobu, jobvt, A, S, U, Vt); +} + +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_SVD_HPP_ diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Cuda_tpl.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Cuda_tpl.hpp index 2ce9f699545f..6749a4740f67 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Cuda_tpl.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Cuda_tpl.hpp @@ -16,6 +16,29 @@ #ifndef KOKKOSLAPACK_CUDA_TPL_HPP_ #define KOKKOSLAPACK_CUDA_TPL_HPP_ +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) +#include "KokkosLapack_cusolver.hpp" + +namespace KokkosLapack { +namespace Impl { + +CudaLapackSingleton::CudaLapackSingleton() { + cusolverStatus_t stat = cusolverDnCreate(&handle); + if (stat != CUSOLVER_STATUS_SUCCESS) + Kokkos::abort("CUSOLVER initialization failed\n"); + + Kokkos::push_finalize_hook([&]() { cusolverDnDestroy(handle); }); +} + +CudaLapackSingleton& CudaLapackSingleton::singleton() { + static CudaLapackSingleton s; + return s; +} + +} // namespace Impl +} // namespace KokkosLapack +#endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) #include diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Host_tpl.cpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Host_tpl.cpp index d629a17f1d9d..add0a802bd9f 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Host_tpl.cpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Host_tpl.cpp @@ -38,6 +38,31 @@ void F77_BLAS_MANGLE(cgesv, CGESV)(int*, int*, std::complex*, int*, int*, void F77_BLAS_MANGLE(zgesv, ZGESV)(int*, int*, std::complex*, int*, int*, std::complex*, int*, int*); +/// +/// Gesvd +/// + +void F77_BLAS_MANGLE(sgesvd, SGESVD)(const char*, const char*, const int*, + 
const int*, float*, const int*, float*, + float*, const int*, float*, const int*, + float*, int*, int*); +void F77_BLAS_MANGLE(dgesvd, DGESVD)(const char*, const char*, const int*, + const int*, double*, const int*, double*, + double*, const int*, double*, const int*, + double*, int*, int*); +void F77_BLAS_MANGLE(cgesvd, CGESVD)(const char*, const char*, const int*, + const int*, std::complex*, + const int*, float*, std::complex*, + const int*, std::complex*, + const int*, std::complex*, int*, + float*, int*); +void F77_BLAS_MANGLE(zgesvd, ZGESVD)(const char*, const char*, const int*, + const int*, std::complex*, + const int*, double*, std::complex*, + const int*, std::complex*, + const int*, std::complex*, int*, + double*, int*); + /// /// Trtri /// @@ -64,6 +89,11 @@ void F77_BLAS_MANGLE(ztrtri, ZTRTRI)(const char*, const char*, int*, #define F77_FUNC_CGESV F77_BLAS_MANGLE(cgesv, CGESV) #define F77_FUNC_ZGESV F77_BLAS_MANGLE(zgesv, ZGESV) +#define F77_FUNC_SGESVD F77_BLAS_MANGLE(sgesvd, SGESVD) +#define F77_FUNC_DGESVD F77_BLAS_MANGLE(dgesvd, DGESVD) +#define F77_FUNC_CGESVD F77_BLAS_MANGLE(cgesvd, CGESVD) +#define F77_FUNC_ZGESVD F77_BLAS_MANGLE(zgesvd, ZGESVD) + #define F77_FUNC_STRTRI F77_BLAS_MANGLE(strtri, STRTRI) #define F77_FUNC_DTRTRI F77_BLAS_MANGLE(dtrtri, DTRTRI) #define F77_FUNC_CTRTRI F77_BLAS_MANGLE(ctrtri, CTRTRI) @@ -82,6 +112,15 @@ void HostLapack::gesv(int n, int rhs, float* a, int lda, int* ipiv, F77_FUNC_SGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> +void HostLapack::gesvd(const char jobu, const char jobvt, const int m, + const int n, float* a, const int lda, float* s, + float* u, const int ldu, float* vt, + const int ldvt, float* work, int lwork, + float* /*rwork*/, int info) { + F77_FUNC_SGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, + &lwork, &info); +} +template <> int HostLapack::trtri(const char uplo, const char diag, int n, const float* a, int lda) { int info = 0; @@ -99,6 +138,15 @@ void 
HostLapack::gesv(int n, int rhs, double* a, int lda, int* ipiv, F77_FUNC_DGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> +void HostLapack::gesvd(const char jobu, const char jobvt, const int m, + const int n, double* a, const int lda, double* s, + double* u, const int ldu, double* vt, + const int ldvt, double* work, int lwork, + double* /*rwork*/, int info) { + F77_FUNC_DGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, + &lwork, &info); +} +template <> int HostLapack::trtri(const char uplo, const char diag, int n, const double* a, int lda) { int info = 0; @@ -118,6 +166,15 @@ void HostLapack >::gesv(int n, int rhs, F77_FUNC_CGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> +void HostLapack >::gesvd( + const char jobu, const char jobvt, const int m, const int n, + std::complex* a, const int lda, float* s, std::complex* u, + const int ldu, std::complex* vt, const int ldvt, + std::complex* work, int lwork, float* rwork, int info) { + F77_FUNC_CGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, + &lwork, rwork, &info); +} +template <> int HostLapack >::trtri(const char uplo, const char diag, int n, const std::complex* a, int lda) { @@ -138,6 +195,15 @@ void HostLapack >::gesv(int n, int rhs, F77_FUNC_ZGESV(&n, &rhs, a, &lda, ipiv, b, &ldb, &info); } template <> +void HostLapack >::gesvd( + const char jobu, const char jobvt, const int m, const int n, + std::complex* a, const int lda, double* s, std::complex* u, + const int ldu, std::complex* vt, const int ldvt, + std::complex* work, int lwork, double* rwork, int info) { + F77_FUNC_ZGESVD(&jobu, &jobvt, &m, &n, a, &lda, s, u, &ldu, vt, &ldvt, work, + &lwork, rwork, &info); +} +template <> int HostLapack >::trtri(const char uplo, const char diag, int n, const std::complex* a, diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Host_tpl.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Host_tpl.hpp index d74099aaec36..9eca83afea0d 100644 --- 
a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Host_tpl.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_Host_tpl.hpp @@ -33,6 +33,12 @@ struct HostLapack { static void gesv(int n, int rhs, T *a, int lda, int *ipiv, T *b, int ldb, int info); + static void gesvd(const char jobu, const char jobvt, const int m, const int n, + T *A, const int lda, + typename Kokkos::ArithTraits::mag_type *S, T *U, + const int ldu, T *Vt, const int ldvt, T *work, int lwork, + typename Kokkos::ArithTraits::mag_type *rwork, int info); + static int trtri(const char uplo, const char diag, int n, const T *a, int lda); }; diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_cusolver.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_cusolver.hpp new file mode 100644 index 000000000000..006fd68b6fe3 --- /dev/null +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_cusolver.hpp @@ -0,0 +1,92 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_CUSOLVER_HPP_ +#define KOKKOSLAPACK_CUSOLVER_HPP_ + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include + +namespace KokkosLapack { +namespace Impl { + +// Declaration of the singleton for cusolver +// this is the only header that needs to be +// included when using cusolverDn. 
+struct CudaLapackSingleton { + cusolverDnHandle_t handle; + + CudaLapackSingleton(); + + static CudaLapackSingleton& singleton(); +}; + +inline void cusolver_internal_error_throw(cusolverStatus_t cusolverStatus, + const char* name, const char* file, + const int line) { + std::ostringstream out; + out << name << " error( "; + switch (cusolverStatus) { + case CUSOLVER_STATUS_NOT_INITIALIZED: + out << "CUSOLVER_STATUS_NOT_INITIALIZED): cusolver handle was not " + "created correctly."; + break; + case CUSOLVER_STATUS_ALLOC_FAILED: + out << "CUSOLVER_STATUS_ALLOC_FAILED): you might tried to allocate too " + "much memory"; + break; + case CUSOLVER_STATUS_INVALID_VALUE: + out << "CUSOLVER_STATUS_INVALID_VALUE)"; + break; + case CUSOLVER_STATUS_ARCH_MISMATCH: + out << "CUSOLVER_STATUS_ARCH_MISMATCH)"; + break; + case CUSOLVER_STATUS_EXECUTION_FAILED: + out << "CUSOLVER_STATUS_EXECUTION_FAILED)"; + break; + case CUSOLVER_STATUS_INTERNAL_ERROR: + out << "CUSOLVER_STATUS_INTERNAL_ERROR)"; + break; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + out << "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED)"; + break; + default: out << "unrecognized error code): this is bad!"; break; + } + if (file) { + out << " " << file << ":" << line; + } + throw std::runtime_error(out.str()); +} + +inline void cusolver_internal_safe_call(cusolverStatus_t cusolverStatus, + const char* name, + const char* file = nullptr, + const int line = 0) { + if (CUSOLVER_STATUS_SUCCESS != cusolverStatus) { + cusolver_internal_error_throw(cusolverStatus, name, file, line); + } +} + +// The macro below defines is the public interface for the safe cusolver calls. +// The functions themselves are protected by impl namespace. 
+#define KOKKOS_CUSOLVER_SAFE_CALL_IMPL(call) \ + KokkosLapack::Impl::cusolver_internal_safe_call(call, #call, __FILE__, \ + __LINE__) + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#endif // KOKKOSLAPACK_CUSOLVER_HPP_ diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp index a3d8bb6ee9ad..9fbd299ca528 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_avail.hpp @@ -20,7 +20,7 @@ namespace KokkosLapack { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct gesv_tpl_spec_avail { enum : bool { value = false }; }; @@ -31,9 +31,12 @@ struct gesv_tpl_spec_avail { #define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, MEMSPACE) \ template \ struct gesv_tpl_spec_avail< \ + ExecSpace, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ }; @@ -46,37 +49,29 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -/* -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( double, Kokkos::LayoutRight, -Kokkos::HostSpace) #endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( float, Kokkos::LayoutRight, -Kokkos::HostSpace) #endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, -Kokkos::LayoutRight, Kokkos::HostSpace) #endif -#if 
defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_LAPACK( Kokkos::complex, -Kokkos::LayoutRight, Kokkos::HostSpace) #endif -*/ #endif +} // namespace Impl +} // namespace KokkosLapack // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA +#include "magma_v2.h" -#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ - template \ - struct gesv_tpl_spec_avail< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits > > { \ - enum : bool { value = true }; \ +namespace KokkosLapack { +namespace Impl { +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(double, Kokkos::LayoutLeft, @@ -87,28 +82,85 @@ KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA + +// CUSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +namespace KokkosLapack { +namespace Impl { + +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; -/* -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, -Kokkos::CudaSpace) #endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined 
(KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, -Kokkos::CudaSpace) #endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( -Kokkos::complex,Kokkos::LayoutRight, Kokkos::CudaSpace) #endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) - KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, -Kokkos::LayoutRight, Kokkos::CudaSpace) #endif -*/ +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) #endif } // namespace Impl } // namespace KokkosLapack +#endif // CUSOLVER + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER +#include + +namespace KokkosLapack { +namespace Impl { + +#define KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct gesv_tpl_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, + Kokkos::HIPSpace) 
+KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER #endif diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp index 5846e177d653..41592e079acb 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_gesv_tpl_spec_decl.hpp @@ -45,229 +45,109 @@ inline void gesv_print_specialization() { namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_DGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,double]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 
1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), \ - B.data(), LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +template +void lapackGesvWrapper(const AViewType& A, const BViewType& B, + const IPIVViewType& IPIV) { + using Scalar = typename AViewType::non_const_value_type; + + const bool with_pivot = !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); + + const int N = static_cast(A.extent(1)); + const int AST = static_cast(A.stride(1)); + const int LDA = (AST == 0) ? 1 : AST; + const int BST = static_cast(B.stride(1)); + const int LDB = (BST == 0) ? 1 : BST; + const int NRHS = static_cast(B.extent(1)); + + int info = 0; + + if (with_pivot) { + if constexpr (Kokkos::ArithTraits::is_complex) { + using MagType = typename Kokkos::ArithTraits::mag_type; + + HostLapack>::gesv( + N, NRHS, reinterpret_cast*>(A.data()), LDA, + IPIV.data(), reinterpret_cast*>(B.data()), LDB, + info); + } else { + HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), + LDB, info); + } + } +} -#define KOKKOSLAPACK_SGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ +#define KOKKOSLAPACK_GESV_LAPACK(SCALAR, LAYOUT, EXECSPACE, MEM_SPACE) \ + template <> \ struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ + EXECSPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail< \ + EXECSPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + 
Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK,float]"); \ + static void gesv(const EXECSPACE& /* space */, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_LAPACK," #SCALAR \ + "]"); \ gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostLapack::gesv(N, NRHS, A.data(), LDA, IPIV.data(), B.data(), \ - LDB, info); \ - } \ + lapackGesvWrapper(A, B, IPIV); \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSLAPACK_ZGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - 
"KokkosLapack::gesv[TPL_LAPACK,complex]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostLapack >::gesv( \ - N, NRHS, reinterpret_cast*>(A.data()), LDA, \ - IPIV.data(), reinterpret_cast*>(B.data()), \ - LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSLAPACK_CGESV_LAPACK(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - int*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_LAPACK,complex]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - const int N = static_cast(A.extent(1)); \ - const int AST = static_cast(A.stride(1)); \ - const int LDA = (AST == 0) ? 1 : AST; \ - const int BST = static_cast(B.stride(1)); \ - const int LDB = (BST == 0) ? 
1 : BST; \ - const int NRHS = static_cast(B.extent(1)); \ - \ - int info = 0; \ - \ - if (with_pivot) { \ - HostLapack >::gesv( \ - N, NRHS, reinterpret_cast*>(A.data()), LDA, \ - IPIV.data(), reinterpret_cast*>(B.data()), \ - LDB, info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_DGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) - -KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_SGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial, Kokkos::HostSpace) +#endif -KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_ZGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP, Kokkos::HostSpace) +#endif -KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, true) -KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) +#if defined(KOKKOS_ENABLE_THREADS) +KOKKOSLAPACK_GESV_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads, + Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, 
Kokkos::LayoutLeft, + Kokkos::Threads, Kokkos::HostSpace) +KOKKOSLAPACK_GESV_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads, Kokkos::HostSpace) +#endif } // namespace Impl } // namespace KokkosLapack @@ -275,265 +155,403 @@ KOKKOSLAPACK_CGESV_LAPACK(Kokkos::LayoutLeft, Kokkos::HostSpace, false) // MAGMA #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA -#include +#include namespace KokkosLapack { namespace Impl { -#define KOKKOSLAPACK_DGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - magma_int_t*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,double]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - magma_int_t N = static_cast(A.extent(1)); \ - magma_int_t AST = static_cast(A.stride(1)); \ - magma_int_t LDA = (AST == 0) ? 1 : AST; \ - magma_int_t BST = static_cast(B.stride(1)); \ - magma_int_t LDB = (BST == 0) ? 
1 : BST; \ - magma_int_t NRHS = static_cast(B.extent(1)); \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - magma_int_t info = 0; \ - \ - if (with_pivot) { \ - magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), \ - LDA, IPIV.data(), \ - reinterpret_cast(B.data()), LDB, \ - &info); \ - } else { \ - magma_dgesv_nopiv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, &info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +template +void magmaGesvWrapper(const ExecSpace& space, const AViewType& A, + const BViewType& B, const IPIVViewType& IPIV) { + using scalar_type = typename AViewType::non_const_value_type; + + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA," + + Kokkos::ArithTraits::name() + "]"); + gesv_print_specialization(); + + const bool with_pivot = !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); + + magma_int_t N = static_cast(A.extent(1)); + magma_int_t AST = static_cast(A.stride(1)); + magma_int_t LDA = (AST == 0) ? 1 : AST; + magma_int_t BST = static_cast(B.stride(1)); + magma_int_t LDB = (BST == 0) ? 
1 : BST; + magma_int_t NRHS = static_cast(B.extent(1)); + + KokkosLapack::Impl::MagmaSingleton& s = + KokkosLapack::Impl::MagmaSingleton::singleton(); + magma_int_t info = 0; + + space.fence(); + if constexpr (std::is_same_v) { + if (with_pivot) { + magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + IPIV.data(), reinterpret_cast(B.data()), + LDB, &info); + } else { + magma_sgesv_nopiv_gpu(N, NRHS, reinterpret_cast(A.data()), + LDA, reinterpret_cast(B.data()), + LDB, &info); + } + } -#define KOKKOSLAPACK_SGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ + if constexpr (std::is_same_v) { + if (with_pivot) { + magma_dgesv_gpu(N, NRHS, reinterpret_cast(A.data()), LDA, + IPIV.data(), reinterpret_cast(B.data()), + LDB, &info); + } else { + magma_dgesv_nopiv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); + } + } + + if constexpr (std::is_same_v>) { + if (with_pivot) { + magma_cgesv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + IPIV.data(), reinterpret_cast(B.data()), LDB, + &info); + } else { + magma_cgesv_nopiv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); + } + } + + if constexpr (std::is_same_v>) { + if (with_pivot) { + magma_zgesv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + IPIV.data(), reinterpret_cast(B.data()), LDB, + &info); + } else { + magma_zgesv_nopiv_gpu( + N, NRHS, reinterpret_cast(A.data()), LDA, + reinterpret_cast(B.data()), LDB, &info); + } + } + ExecSpace().fence(); + Kokkos::Profiling::popRegion(); +} + +#define KOKKOSLAPACK_GESV_MAGMA(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ struct GESV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - 
Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = Kokkos::View< \ magma_int_t*, LAYOUT, \ Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_MAGMA,float]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - magma_int_t N = static_cast(A.extent(1)); \ - magma_int_t AST = static_cast(A.stride(1)); \ - magma_int_t LDA = (AST == 0) ? 1 : AST; \ - magma_int_t BST = static_cast(B.stride(1)); \ - magma_int_t LDB = (BST == 0) ? 
1 : BST; \ - magma_int_t NRHS = static_cast(B.extent(1)); \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - magma_int_t info = 0; \ + Kokkos::MemoryTraits>; \ \ - if (with_pivot) { \ - magma_sgesv_gpu(N, NRHS, reinterpret_cast(A.data()), \ - LDA, IPIV.data(), \ - reinterpret_cast(B.data()), LDB, \ - &info); \ - } else { \ - magma_sgesv_nopiv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, &info); \ - } \ - Kokkos::Profiling::popRegion(); \ + static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + magmaGesvWrapper(space, A, B, IPIV); \ } \ }; -#define KOKKOSLAPACK_ZGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - magma_int_t*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - magma_int_t N = static_cast(A.extent(1)); \ - magma_int_t AST = static_cast(A.stride(1)); \ - magma_int_t LDA = (AST == 0) ? 1 : AST; \ - magma_int_t BST = static_cast(B.stride(1)); \ - magma_int_t LDB = (BST == 0) ? 
1 : BST; \ - magma_int_t NRHS = static_cast(B.extent(1)); \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - magma_int_t info = 0; \ - \ - if (with_pivot) { \ - magma_zgesv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - IPIV.data(), reinterpret_cast(B.data()), \ - LDB, &info); \ - } else { \ - magma_zgesv_nopiv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, &info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +KOKKOSLAPACK_GESV_MAGMA(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_MAGMA(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) -#define KOKKOSLAPACK_CGESV_MAGMA(LAYOUT, MEM_SPACE, ETI_SPEC_AVAIL) \ - template \ - struct GESV**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View< \ - magma_int_t*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits > \ - PViewType; \ - \ - static void gesv(const AViewType& A, const BViewType& B, \ - const PViewType& IPIV) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosLapack::gesv[TPL_MAGMA,complex]"); \ - gesv_print_specialization(); \ - const bool with_pivot = \ - !((IPIV.extent(0) == 0) && (IPIV.data() == nullptr)); \ - \ - magma_int_t N = static_cast(A.extent(1)); \ - magma_int_t AST = static_cast(A.stride(1)); \ - magma_int_t LDA = (AST == 0) ? 1 : AST; \ - magma_int_t BST = static_cast(B.stride(1)); \ - magma_int_t LDB = (BST == 0) ? 
1 : BST; \ - magma_int_t NRHS = static_cast(B.extent(1)); \ - \ - KokkosLapack::Impl::MagmaSingleton& s = \ - KokkosLapack::Impl::MagmaSingleton::singleton(); \ - magma_int_t info = 0; \ - \ - if (with_pivot) { \ - magma_cgesv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - IPIV.data(), reinterpret_cast(B.data()), \ - LDB, &info); \ - } else { \ - magma_cgesv_nopiv_gpu( \ - N, NRHS, reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, &info); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA + +// CUSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include "KokkosLapack_cusolver.hpp" + +namespace KokkosLapack { +namespace Impl { + +template +void cusolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, + const AViewType& A, const BViewType& B) { + using memory_space = typename AViewType::memory_space; + using Scalar = typename BViewType::non_const_value_type; + using ALayout_t = typename AViewType::array_layout; + using BLayout_t = typename BViewType::array_layout; + + const int m = A.extent_int(0); + const int n = A.extent_int(1); + const int lda = std::is_same_v ? A.stride(0) + : A.stride(1); + + (void)B; + + const int nrhs = B.extent_int(1); + const int ldb = std::is_same_v ? 
B.stride(0) + : B.stride(1); + int lwork = 0; + Kokkos::View info("getrf info"); + + CudaLapackSingleton& s = CudaLapackSingleton::singleton(); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSetStream(s.handle, space.cuda_stream())); + if constexpr (std::is_same_v) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + Kokkos::View Workspace("getrf workspace", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgetrf(s.handle, m, n, A.data(), + lda, Workspace.data(), + IPIV.data(), info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, + IPIV.data(), B.data(), ldb, info.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnDgetrf_bufferSize(s.handle, m, n, A.data(), lda, &lwork)); + Kokkos::View Workspace("getrf workspace", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgetrf(s.handle, m, n, A.data(), + lda, Workspace.data(), + IPIV.data(), info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnDgetrs(s.handle, CUBLAS_OP_N, m, nrhs, A.data(), lda, + IPIV.data(), B.data(), ldb, info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrf_bufferSize( + s.handle, m, n, reinterpret_cast(A.data()), lda, &lwork)); + Kokkos::View Workspace("getrf workspace", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnCgetrf(s.handle, m, n, reinterpret_cast(A.data()), + lda, reinterpret_cast(Workspace.data()), + IPIV.data(), info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnCgetrs( + s.handle, CUBLAS_OP_N, m, nrhs, reinterpret_cast(A.data()), + lda, IPIV.data(), reinterpret_cast(B.data()), ldb, + info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf_bufferSize( + s.handle, m, n, reinterpret_cast(A.data()), lda, + &lwork)); + Kokkos::View Workspace("getrf workspace", + lwork); + + 
KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrf( + s.handle, m, n, reinterpret_cast(A.data()), lda, + reinterpret_cast(Workspace.data()), IPIV.data(), + info.data())); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnZgetrs( + s.handle, CUBLAS_OP_N, m, nrhs, + reinterpret_cast(A.data()), lda, IPIV.data(), + reinterpret_cast(B.data()), ldb, info.data())); + } + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); +} + +#define KOKKOSLAPACK_GESV_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const Kokkos::Cuda& space, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::gesv[TPL_CUSOLVER," #SCALAR \ + "]"); \ + gesv_print_specialization(); \ + \ + cusolverGesvWrapper(space, IPIV, A, B); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_DGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) -KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_SGESV_MAGMA(Kokkos::LayoutLeft, 
Kokkos::CudaSpace, false) +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSLAPACK_GESV_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_GESV_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif -KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_ZGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSOLVER -KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSLAPACK_CGESV_MAGMA(Kokkos::LayoutLeft, Kokkos::CudaSpace, false) +// ROCSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER +#include +#include + +namespace KokkosLapack { +namespace Impl { + +template +void rocsolverGesvWrapper(const ExecutionSpace& space, const IPIVViewType& IPIV, + const AViewType& A, const BViewType& B) { + using Scalar = typename BViewType::non_const_value_type; + using ALayout_t = typename AViewType::array_layout; + using BLayout_t = typename BViewType::array_layout; + + const rocblas_int N = static_cast(A.extent(0)); + const rocblas_int nrhs = static_cast(B.extent(1)); + const rocblas_int lda = std::is_same_v + ? A.stride(0) + : A.stride(1); + const rocblas_int ldb = std::is_same_v + ? 
B.stride(0) + : B.stride(1); + Kokkos::View info("rocsolver info"); + + KokkosBlas::Impl::RocBlasSingleton& s = + KokkosBlas::Impl::RocBlasSingleton::singleton(); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_set_stream(s.handle, space.hip_stream())); + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesv(s.handle, N, nrhs, A.data(), + lda, IPIV.data(), B.data(), + ldb, info.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesv(s.handle, N, nrhs, A.data(), + lda, IPIV.data(), B.data(), + ldb, info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesv( + s.handle, N, nrhs, reinterpret_cast(A.data()), + lda, IPIV.data(), reinterpret_cast(B.data()), + ldb, info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesv( + s.handle, N, nrhs, reinterpret_cast(A.data()), + lda, IPIV.data(), reinterpret_cast(B.data()), + ldb, info.data())); + } + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); +} + +#define KOKKOSLAPACK_GESV_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct GESV< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + gesv_eti_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using BViewType = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using PViewType = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void gesv(const Kokkos::HIP& space, const AViewType& A, \ + const BViewType& B, const PViewType& IPIV) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosLapack::gesv[TPL_ROCSOLVER," #SCALAR "]"); \ + gesv_print_specialization(); \ + \ + rocsolverGesvWrapper(space, IPIV, 
A, B); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSLAPACK_GESV_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_GESV_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace) } // namespace Impl } // namespace KokkosLapack -#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER #endif diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_magma.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_magma.hpp index 66529d73de37..dfde113fa663 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_magma.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_magma.hpp @@ -16,13 +16,16 @@ #ifndef KOKKOSLAPACK_MAGMA_HPP_ #define KOKKOSLAPACK_MAGMA_HPP_ -// If LAPACK TPL is enabled, it is preferred over magma's LAPACK + #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA #include "magma_v2.h" namespace KokkosLapack { namespace Impl { +// Declaration of the singleton for cusolver +// this is the only header that needs to be +// included when using cusolverDn. struct MagmaSingleton { MagmaSingleton(); @@ -31,5 +34,6 @@ struct MagmaSingleton { } // namespace Impl } // namespace KokkosLapack -#endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA +#endif + #endif // KOKKOSLAPACK_MAGMA_HPP_ diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp new file mode 100644 index 000000000000..7a7403209fa5 --- /dev/null +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_avail.hpp @@ -0,0 +1,171 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosLapack { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct svd_tpl_spec_avail { + enum : bool { value = false }; +}; + +// LAPACK +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || \ + defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(SCALAR, LAYOUT, EXECSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + EXECSPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Serial) +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::OpenMP) +#endif + +#if defined(KOKKOS_ENABLE_THREADS) 
+KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(float, Kokkos::LayoutLeft, + Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(double, Kokkos::LayoutLeft, + Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_LAPACK(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::Threads) +#endif + +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK || KOKKOSKERNELS_ENABLE_TPL_MKL + +// CUSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::CudaSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(float, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(double, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_CUSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif // CUDAUVMSPACE +#endif // CUSOLVER + +// ROCSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER +#define KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(SCALAR, LAYOUT, MEMSPACE) \ + template <> \ + struct svd_tpl_spec_avail< \ + Kokkos::HIP, \ + 
Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, Kokkos::HIPSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(float, Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(double, Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_ROCSOLVER(Kokkos::complex, + Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +#endif // HIPMANAGEDSPACE +#endif // ROCSOLVER + +} // namespace Impl +} // namespace KokkosLapack + +#endif // KOKKOSLAPACK_SVD_TPL_SPEC_AVAIL_HPP_ diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp new file mode 100644 index 000000000000..4385fa40d636 --- /dev/null +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_svd_tpl_spec_decl.hpp @@ -0,0 +1,688 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSLAPACK_SVD_TPL_SPEC_DECL_HPP_ +#define KOKKOSLAPACK_SVD_TPL_SPEC_DECL_HPP_ + +#include "KokkosKernels_Error.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosLapack { +namespace Impl { +template +inline void svd_print_specialization() { +#ifdef KOKKOSKERNELS_ENABLE_CHECK_SPECIALIZATION +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + if constexpr (std::is_same_v) { + printf( + "KokkosLapack::svd<> TPL Cusolver specialization for < %s , %s, %s, %s " + ">\n", + typeid(AMatrix).name(), typeid(SVector).name(), typeid(UMatrix).name(), + typeid(VMatrix).name()); + } +#endif +#endif +} +} // namespace Impl +} // namespace KokkosLapack + +// LAPACK +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +#include "KokkosLapack_Host_tpl.hpp" + +namespace KokkosLapack { +namespace Impl { + +template +void lapackSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], + const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + using memory_space = typename AMatrix::memory_space; + using Scalar = typename AMatrix::non_const_value_type; + using Magnitude = typename SVector::non_const_value_type; + using ALayout_t = typename AMatrix::array_layout; + using ULayout_t = typename UMatrix::array_layout; + using VLayout_t = typename VMatrix::array_layout; + + static_assert(std::is_same_v, + "KokkosLapack - svd: A needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: U needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: Vt needs to have a Kokkos::LayoutLeft"); + + const int m = A.extent_int(0); + const int n = A.extent_int(1); + const int lda = A.stride(1); + const int ldu = U.stride(1); + const int ldvt = Vt.stride(1); + + int lwork = -1, info = 0; + Kokkos::View 
rwork("svd rwork buffer", + 5 * Kokkos::min(m, n)); + Kokkos::View work("svd work buffer", 1); + if constexpr (Kokkos::ArithTraits::is_complex) { + HostLapack>::gesvd( + jobu[0], jobvt[0], m, n, + reinterpret_cast*>(A.data()), lda, S.data(), + reinterpret_cast*>(U.data()), ldu, + reinterpret_cast*>(Vt.data()), ldvt, + reinterpret_cast*>(work.data()), lwork, + rwork.data(), info); + + lwork = static_cast(work(0).real()); + + work = Kokkos::View("svd work buffer", lwork); + HostLapack>::gesvd( + jobu[0], jobvt[0], m, n, + reinterpret_cast*>(A.data()), lda, S.data(), + reinterpret_cast*>(U.data()), ldu, + reinterpret_cast*>(Vt.data()), ldvt, + reinterpret_cast*>(work.data()), lwork, + rwork.data(), info); + } else { + HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), + lwork, rwork.data(), info); + + lwork = static_cast(work(0)); + + work = Kokkos::View("svd work buffer", lwork); + HostLapack::gesvd(jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), + U.data(), ldu, Vt.data(), ldvt, work.data(), + lwork, rwork.data(), info); + } +} + +#define KOKKOSLAPACK_SVD_LAPACK(SCALAR, LAYOUT, EXEC_SPACE) \ + template <> \ + struct SVD< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = \ + Kokkos::View, \ + 
Kokkos::MemoryTraits>; \ + \ + static void svd(const EXEC_SPACE& space, const char jobu[], \ + const char jobvt[], const AMatrix& A, const SVector& S, \ + const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR \ + "]"); \ + svd_print_specialization(); \ + \ + lapackSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial) +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP) +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +KOKKOSLAPACK_SVD_LAPACK(float, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(double, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads) +KOKKOSLAPACK_SVD_LAPACK(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads) +#endif + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_LAPACK + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include "mkl.h" + +namespace KokkosLapack { +namespace Impl { + +template +void mklSvdWrapper(const ExecutionSpace& /* space */, const char jobu[], + const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + using memory_space = typename AMatrix::memory_space; + using Scalar = typename AMatrix::non_const_value_type; + using Magnitude = typename 
SVector::non_const_value_type; + using ALayout_t = typename AMatrix::array_layout; + using ULayout_t = typename UMatrix::array_layout; + using VLayout_t = typename VMatrix::array_layout; + + static_assert(std::is_same_v, + "KokkosLapack - svd: A needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: U needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: Vt needs to have a Kokkos::LayoutLeft"); + + const lapack_int m = A.extent_int(0); + const lapack_int n = A.extent_int(1); + const lapack_int lda = A.stride(1); + const lapack_int ldu = U.stride(1); + const lapack_int ldvt = Vt.stride(1); + + Kokkos::View rwork("svd rwork buffer", + Kokkos::min(m, n) - 1); + lapack_int ret = 0; + if constexpr (std::is_same_v) { + ret = + LAPACKE_sgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, + S.data(), U.data(), ldu, Vt.data(), ldvt, rwork.data()); + } + if constexpr (std::is_same_v) { + ret = + LAPACKE_dgesvd(LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, A.data(), lda, + S.data(), U.data(), ldu, Vt.data(), ldvt, rwork.data()); + } + if constexpr (std::is_same_v>) { + ret = LAPACKE_cgesvd( + LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, rwork.data()); + } + if constexpr (std::is_same_v>) { + ret = LAPACKE_zgesvd( + LAPACK_COL_MAJOR, jobu[0], jobvt[0], m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, + rwork.data()); + } + + if (ret != 0) { + std::ostringstream os; + os << "KokkosLapack::svd: MKL failed with return value: " << ret << "\n"; + KokkosKernels::Impl::throw_runtime_exception(os.str()); + } +} + +#define KOKKOSLAPACK_SVD_MKL(SCALAR, LAYOUT, EXEC_SPACE) \ + template <> \ + struct SVD< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + 
Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + EXEC_SPACE, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const EXEC_SPACE& space, const char jobu[], \ + const char jobvt[], const AMatrix& A, const SVector& S, \ + const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_LAPACK," #SCALAR \ + "]"); \ + svd_print_specialization(); \ + \ + mklSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#if defined(KOKKOS_ENABLE_SERIAL) +KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::Serial) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Serial) +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, Kokkos::OpenMP) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::OpenMP) +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +KOKKOSLAPACK_SVD_MKL(float, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(double, Kokkos::LayoutLeft, Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, + 
Kokkos::Threads) +KOKKOSLAPACK_SVD_MKL(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::Threads) +#endif + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL + +// CUSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSOLVER +#include "KokkosLapack_cusolver.hpp" + +namespace KokkosLapack { +namespace Impl { + +template +void cusolverSvdWrapper(const ExecutionSpace& space, const char jobu[], + const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + using memory_space = typename AMatrix::memory_space; + using Scalar = typename AMatrix::non_const_value_type; + using Magnitude = typename SVector::non_const_value_type; + using ALayout_t = typename AMatrix::array_layout; + using ULayout_t = typename UMatrix::array_layout; + using VLayout_t = typename VMatrix::array_layout; + + static_assert(std::is_same_v, + "KokkosLapack - svd: A needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: U needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: Vt needs to have a Kokkos::LayoutLeft"); + + const int m = A.extent_int(0); + const int n = A.extent_int(1); + const int lda = A.stride(1); + const int ldu = U.stride(1); + const int ldvt = Vt.stride(1); + + int lwork = 0; + Kokkos::View info("svd info"); + Kokkos::View rwork("svd rwork buffer", + Kokkos::min(m, n) - 1); + + CudaLapackSingleton& s = CudaLapackSingleton::singleton(); + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSetStream(s.handle, space.cuda_stream())); + if constexpr (std::is_same_v) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnSgesvd_bufferSize(s.handle, m, n, &lwork)); + Kokkos::View work("svd work buffer", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSgesvd( + s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), + ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), info.data())); + } + if constexpr (std::is_same_v) { + 
KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnDgesvd_bufferSize(s.handle, m, n, &lwork)); + Kokkos::View work("svd work buffer", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnDgesvd( + s.handle, jobu[0], jobvt[0], m, n, A.data(), lda, S.data(), U.data(), + ldu, Vt.data(), ldvt, work.data(), lwork, rwork.data(), info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnCgesvd_bufferSize(s.handle, m, n, &lwork)); + Kokkos::View work("svd work buffer", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnCgesvd(s.handle, jobu[0], jobvt[0], m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, + reinterpret_cast(work.data()), lwork, + rwork.data(), info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnZgesvd_bufferSize(s.handle, m, n, &lwork)); + Kokkos::View work("svd work buffer", lwork); + + KOKKOS_CUSOLVER_SAFE_CALL_IMPL( + cusolverDnZgesvd(s.handle, jobu[0], jobvt[0], m, n, + reinterpret_cast(A.data()), lda, + S.data(), reinterpret_cast(U.data()), + ldu, reinterpret_cast(Vt.data()), + ldvt, reinterpret_cast(work.data()), + lwork, rwork.data(), info.data())); + } + KOKKOS_CUSOLVER_SAFE_CALL_IMPL(cusolverDnSetStream(s.handle, NULL)); +} + +#define KOKKOSLAPACK_SVD_CUSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct SVD< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + Kokkos::Cuda, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + 
using SVector = \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const Kokkos::Cuda& space, const char jobu[], \ + const char jobvt[], const AMatrix& A, const SVector& S, \ + const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_CUSOLVER," #SCALAR \ + "]"); \ + svd_print_specialization(); \ + \ + cusolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSLAPACK_SVD_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_CUDAUVMSPACE) +KOKKOSLAPACK_SVD_CUSOLVER(float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSLAPACK_SVD_CUSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +#endif + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSOLVER + +// ROCSOLVER +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER +#include +#include + +namespace KokkosLapack { +namespace Impl { + +template +void rocsolverSvdWrapper(const ExecutionSpace& space, const char jobu[], + const char jobvt[], const AMatrix& A, const SVector& S, + const UMatrix& U, const VMatrix& Vt) { + using memory_space = typename AMatrix::memory_space; + using Scalar = typename AMatrix::non_const_value_type; + using Magnitude = typename SVector::non_const_value_type; + using ALayout_t = typename AMatrix::array_layout; + using ULayout_t = typename 
UMatrix::array_layout; + using VLayout_t = typename VMatrix::array_layout; + + static_assert(std::is_same_v, + "KokkosLapack - svd: A needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: U needs to have a Kokkos::LayoutLeft"); + static_assert(std::is_same_v, + "KokkosLapack - svd: Vt needs to have a Kokkos::LayoutLeft"); + + const rocblas_int m = A.extent_int(0); + const rocblas_int n = A.extent_int(1); + const rocblas_int lda = A.stride(1); + const rocblas_int ldu = U.stride(1); + const rocblas_int ldvt = Vt.stride(1); + + rocblas_svect UVecMode = rocblas_svect_all; + if ((jobu[0] == 'S') || (jobu[0] == 's')) { + UVecMode = rocblas_svect_singular; + } else if ((jobu[0] == 'O') || (jobu[0] == 'o')) { + UVecMode = rocblas_svect_overwrite; + } else if ((jobu[0] == 'N') || (jobu[0] == 'n')) { + UVecMode = rocblas_svect_none; + } + rocblas_svect VVecMode = rocblas_svect_all; + if ((jobvt[0] == 'S') || (jobvt[0] == 's')) { + VVecMode = rocblas_svect_singular; + } else if ((jobvt[0] == 'O') || (jobvt[0] == 'o')) { + VVecMode = rocblas_svect_overwrite; + } else if ((jobvt[0] == 'N') || (jobvt[0] == 'n')) { + VVecMode = rocblas_svect_none; + } + + const rocblas_workmode WorkMode = rocblas_outofplace; + + Kokkos::View info("svd info"); + Kokkos::View rwork("svd rwork buffer", + Kokkos::min(m, n) - 1); + + KokkosBlas::Impl::RocBlasSingleton& s = + KokkosBlas::Impl::RocBlasSingleton::singleton(); + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( + rocblas_set_stream(s.handle, space.hip_stream())); + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_sgesvd( + s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), U.data(), + ldu, Vt.data(), ldvt, rwork.data(), WorkMode, info.data())); + } + if constexpr (std::is_same_v) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_dgesvd( + s.handle, UVecMode, VVecMode, m, n, A.data(), lda, S.data(), U.data(), + ldu, Vt.data(), ldvt, rwork.data(), WorkMode, info.data())); + } + if constexpr 
(std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_cgesvd( + s.handle, UVecMode, VVecMode, m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, rwork.data(), + WorkMode, info.data())); + } + if constexpr (std::is_same_v>) { + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocsolver_zgesvd( + s.handle, UVecMode, VVecMode, m, n, + reinterpret_cast(A.data()), lda, S.data(), + reinterpret_cast(U.data()), ldu, + reinterpret_cast(Vt.data()), ldvt, + rwork.data(), WorkMode, info.data())); + } + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); +} + +#define KOKKOSLAPACK_SVD_ROCSOLVER(SCALAR, LAYOUT, MEM_SPACE) \ + template <> \ + struct SVD< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, \ + svd_eti_spec_avail< \ + Kokkos::HIP, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>>::value> { \ + using AMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using SVector = \ + Kokkos::View::mag_type*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>; \ + using UMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using VMatrix = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + \ + static void svd(const Kokkos::HIP& space, const char jobu[], \ + const char jobvt[], const AMatrix& A, const SVector& S, \ + const UMatrix& U, const VMatrix& Vt) { \ + Kokkos::Profiling::pushRegion("KokkosLapack::svd[TPL_ROCSOLVER," #SCALAR \ + "]"); \ + svd_print_specialization(); \ + \ + rocsolverSvdWrapper(space, jobu, jobvt, A, S, U, Vt); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +KOKKOSLAPACK_SVD_ROCSOLVER(float, Kokkos::LayoutLeft, 
Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPSpace) + +#if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) +KOKKOSLAPACK_SVD_ROCSOLVER(float, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(double, Kokkos::LayoutLeft, Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +KOKKOSLAPACK_SVD_ROCSOLVER(Kokkos::complex, Kokkos::LayoutLeft, + Kokkos::HIPManagedSpace) +#endif + +} // namespace Impl +} // namespace KokkosLapack +#endif // KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER + +#endif // KOKKOSLAPACK_SVD_TPL_SPEC_DECL_HPP_ diff --git a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp index 655b5b857944..b7e9c6e341e7 100644 --- a/packages/kokkos-kernels/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/lapack/tpls/KokkosLapack_trtri_tpl_spec_decl.hpp @@ -18,6 +18,7 @@ #define KOKKOSLAPACK_TRTRI_TPL_SPEC_DECL_HPP_ #include "KokkosLapack_Host_tpl.hpp" // trtri prototype + #ifdef KOKKOSKERNELS_ENABLE_TPL_MAGMA #include "KokkosLapack_magma.hpp" #endif diff --git a/packages/kokkos-kernels/lapack/unit_test/Test_Lapack.hpp b/packages/kokkos-kernels/lapack/unit_test/Test_Lapack.hpp index 815c442884c0..1a717521f8d7 100644 --- a/packages/kokkos-kernels/lapack/unit_test/Test_Lapack.hpp +++ b/packages/kokkos-kernels/lapack/unit_test/Test_Lapack.hpp @@ -18,5 +18,6 @@ #include "Test_Lapack_gesv.hpp" #include "Test_Lapack_trtri.hpp" +#include "Test_Lapack_svd.hpp" #endif // TEST_LAPACK_HPP diff --git a/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_gesv.hpp b/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_gesv.hpp index 06f51b7eb0f0..77774d1d3f3a 100644 
--- a/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_gesv.hpp +++ b/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_gesv.hpp @@ -15,13 +15,15 @@ //@HEADER // only enable this test where KokkosLapack supports gesv: -// CUDA+MAGMA and HOST+LAPACK -#if (defined(TEST_CUDA_LAPACK_CPP) && \ - defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)) || \ - (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ - (defined(TEST_OPENMP_LAPACK_CPP) || \ - defined(TEST_OPENMPTARGET_LAPACK_CPP) || \ - defined(TEST_SERIAL_LAPACK_CPP) || defined(TEST_THREADS_LAPACK_CPP))) +// CUDA+(MAGMA or CUSOLVER), HIP+ROCSOLVER and HOST+LAPACK +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + (defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) || \ + defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER))) || \ + (defined(TEST_HIP_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ + defined(TEST_THREADS_LAPACK_CPP))) #include #include @@ -34,11 +36,13 @@ namespace Test { -template +template void impl_test_gesv(const char* mode, const char* padding, int N) { - typedef typename Device::execution_space execution_space; - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits ats; + using execution_space = typename Device::execution_space; + using ScalarA = typename ViewTypeA::value_type; + using ats = Kokkos::ArithTraits; + + execution_space space{}; Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -80,7 +84,9 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { Kokkos::deep_copy(h_X0, X0); // Allocate IPIV view on host - typedef Kokkos::View ViewTypeP; + using ViewTypeP = typename std::conditional< + MAGMA, Kokkos::View, + Kokkos::View>::type; ViewTypeP ipiv; int Nt = 0; if (mode[0] == 'Y') { @@ -90,7 +96,7 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { // Solve. 
try { - KokkosLapack::gesv(A, B, ipiv); + KokkosLapack::gesv(space, A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) @@ -124,26 +130,30 @@ void impl_test_gesv(const char* mode, const char* padding, int N) { // Checking vs ref on CPU, this eps is about 10^-9 typedef typename ats::mag_type mag_type; - const mag_type eps = 1.0e7 * ats::epsilon(); + const mag_type eps = 3.0e7 * ats::epsilon(); bool test_flag = true; for (int i = 0; i < N; i++) { if (ats::abs(h_B(i) - h_X0(i)) > eps) { test_flag = false; - // printf( " Error %d, pivot %c, padding %c: result( %.15lf ) != - // solution( %.15lf ) at (%d)\n", N, mode[0], padding[0], - // ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i) ); - // break; + printf( + " Error %d, pivot %c, padding %c: result( %.15lf ) !=" + "solution( %.15lf ) at (%d), error=%.15e, eps=%.15e\n", + N, mode[0], padding[0], ats::abs(h_B(i)), ats::abs(h_X0(i)), int(i), + ats::abs(h_B(i) - h_X0(i)), eps); + break; } } ASSERT_EQ(test_flag, true); } -template +template void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, int nrhs) { - typedef typename Device::execution_space execution_space; - typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::ArithTraits ats; + using execution_space = typename Device::execution_space; + using ScalarA = typename ViewTypeA::value_type; + using ats = Kokkos::ArithTraits; + + execution_space space{}; Kokkos::Random_XorShift64_Pool rand_pool(13718); @@ -185,7 +195,9 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, Kokkos::deep_copy(h_X0, X0); // Allocate IPIV view on host - typedef Kokkos::View ViewTypeP; + using ViewTypeP = typename std::conditional< + MAGMA, Kokkos::View, + Kokkos::View>::type; ViewTypeP ipiv; int Nt = 0; if (mode[0] == 'Y') { @@ -195,7 +207,7 @@ void impl_test_gesv_mrhs(const char* mode, const char* padding, int N, // Solve. 
try { - KokkosLapack::gesv(A, B, ipiv); + KokkosLapack::gesv(space, A, B, ipiv); } catch (const std::runtime_error& error) { // Check for expected runtime errors due to: // no-pivoting case (note: only MAGMA supports no-pivoting interface) @@ -253,41 +265,51 @@ int test_gesv(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ll; - typedef Kokkos::View view_type_b_ll; - Test::impl_test_gesv( + using view_type_a_ll = Kokkos::View; + using view_type_b_ll = Kokkos::View; + +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ + (defined(TEST_HIP_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ + defined(TEST_THREADS_LAPACK_CPP))) + Test::impl_test_gesv( &mode[0], "N", 2); // no padding - Test::impl_test_gesv( + Test::impl_test_gesv( &mode[0], "N", 13); // no padding - Test::impl_test_gesv( + Test::impl_test_gesv( &mode[0], "N", 179); // no padding - Test::impl_test_gesv( + Test::impl_test_gesv( &mode[0], "N", 64); // no padding - Test::impl_test_gesv( + Test::impl_test_gesv( &mode[0], "N", 1024); // no padding - Test::impl_test_gesv(&mode[0], "Y", - 13); // padding - Test::impl_test_gesv(&mode[0], "Y", - 179); // padding + +#elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + Test::impl_test_gesv( + &mode[0], "N", 2); // no padding + Test::impl_test_gesv( + &mode[0], "N", 13); // no padding + Test::impl_test_gesv( + &mode[0], "N", 179); // no padding + Test::impl_test_gesv( + &mode[0], "N", 64); // no padding + Test::impl_test_gesv( + &mode[0], "N", 1024); // no padding + + Test::impl_test_gesv( + &mode[0], "Y", + 13); // padding + Test::impl_test_gesv( + &mode[0], "Y", + 179); // padding + } 
+#endif #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || - (!defined(KOKKOSKERNELS_ETI_ONLY) && - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; - Test::impl_test_gesv(&mode[0], "N", - 2); //no padding Test::impl_test_gesv(&mode[0], "N", 13); //no padding Test::impl_test_gesv(&mode[0], "N", 179); //no padding - Test::impl_test_gesv(&mode[0], "N", - 64); //no padding Test::impl_test_gesv(&mode[0], "N", 1024);//no padding Test::impl_test_gesv(&mode[0], "Y", 13); //padding - Test::impl_test_gesv(&mode[0], "Y", - 179); //padding #endif - */ // Supress unused parameters on CUDA10 (void)mode; return 1; @@ -298,42 +320,50 @@ int test_gesv_mrhs(const char* mode) { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ll; - typedef Kokkos::View view_type_b_ll; - Test::impl_test_gesv_mrhs( + using view_type_a_ll = Kokkos::View; + using view_type_b_ll = Kokkos::View; + +#if (defined(TEST_CUDA_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER)) || \ + (defined(TEST_HIP_LAPACK_CPP) && \ + defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER)) || \ + (defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) && \ + (defined(TEST_OPENMP_LAPACK_CPP) || defined(TEST_SERIAL_LAPACK_CPP) || \ + defined(TEST_THREADS_LAPACK_CPP))) + Test::impl_test_gesv_mrhs( &mode[0], "N", 2, 5); // no padding - Test::impl_test_gesv_mrhs( + Test::impl_test_gesv_mrhs( &mode[0], "N", 13, 5); // no padding - Test::impl_test_gesv_mrhs( + Test::impl_test_gesv_mrhs( &mode[0], "N", 179, 5); // no padding - Test::impl_test_gesv_mrhs( + Test::impl_test_gesv_mrhs( &mode[0], "N", 64, 5); // no padding - Test::impl_test_gesv_mrhs( + Test::impl_test_gesv_mrhs( &mode[0], "N", 1024, 5); // no padding - Test::impl_test_gesv_mrhs( - &mode[0], "Y", 13, 5); // padding - Test::impl_test_gesv_mrhs( - &mode[0], "Y", 179, 5); // padding + +// 
When appropriate run MAGMA specific tests +#elif defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) && defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + Test::impl_test_gesv_mrhs( + &mode[0], "N", 2, 5); // no padding + Test::impl_test_gesv_mrhs( + &mode[0], "N", 13, 5); // no padding + Test::impl_test_gesv_mrhs( + &mode[0], "N", 179, 5); // no padding + Test::impl_test_gesv_mrhs( + &mode[0], "N", 64, 5); // no padding + Test::impl_test_gesv_mrhs( + &mode[0], "N", 1024, 5); // no padding + + Test::impl_test_gesv_mrhs( + &mode[0], "Y", 13, 5); // padding + Test::impl_test_gesv_mrhs( + &mode[0], "Y", 179, 5); // padding + } +#endif #endif - /* - #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || - (!defined(KOKKOSKERNELS_ETI_ONLY) && - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) typedef Kokkos::View view_type_a_lr; typedef Kokkos::View view_type_b_lr; - Test::impl_test_gesv_mrhs(&mode[0], - "N", 2, 5);//no padding Test::impl_test_gesv_mrhs(&mode[0], "N", 13, 5);//no padding - Test::impl_test_gesv_mrhs(&mode[0], - "N", 179, 5);//no padding Test::impl_test_gesv_mrhs(&mode[0], "N", 64, 5);//no padding - Test::impl_test_gesv_mrhs(&mode[0], - "N", 1024,5);//no padding Test::impl_test_gesv_mrhs(&mode[0], "Y", 13, 5);//padding - Test::impl_test_gesv_mrhs(&mode[0], - "Y", 179, 5);//padding #endif - */ // Supress unused parameters on CUDA10 (void)mode; return 1; @@ -411,4 +441,4 @@ TEST_F(TestCategory, gesv_mrhs_complex_float) { } #endif -#endif // CUDA+MAGMA or LAPACK+HOST +#endif // CUDA+(MAGMA or CUSOLVER) or HIP+ROCSOLVER or LAPACK+HOST diff --git a/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_svd.hpp b/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_svd.hpp new file mode 100644 index 000000000000..da9f9ba480b2 --- /dev/null +++ b/packages/kokkos-kernels/lapack/unit_test/Test_Lapack_svd.hpp @@ -0,0 +1,658 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include +#include +#include + +#include + +namespace Test { + +template +void check_triple_product( + const AMatrix& A, const SVector& S, const UMatrix& U, const VMatrix& Vt, + typename Kokkos::ArithTraits< + typename AMatrix::non_const_value_type>::mag_type tol) { + // After a successful SVD decomposition we have A=U*S*V + // So using gemm we should be able to compare the above + // triple product to the original matrix A. + using execution_space = typename AMatrix::execution_space; + + AMatrix temp("intermediate U*S product", A.extent(0), A.extent(1)); + AMatrix M("U*S*V product", A.extent(0), A.extent(1)); + + // First compute the left side of the product: temp = U*S + Kokkos::parallel_for( + Kokkos::RangePolicy(0, U.extent_int(0)), + KOKKOS_LAMBDA(const int& rowIdx) { + for (int colIdx = 0; colIdx < U.extent_int(1); ++colIdx) { + if (colIdx < S.extent_int(0)) { + temp(rowIdx, colIdx) = U(rowIdx, colIdx) * S(colIdx); + } + } + }); + + // Second compute the right side of the product: M = temp*V = U*S*V + KokkosBlas::gemm("N", "N", 1, temp, Vt, 0, M); + + typename AMatrix::HostMirror A_h = Kokkos::create_mirror_view(A); + typename AMatrix::HostMirror M_h = Kokkos::create_mirror_view(M); + Kokkos::deep_copy(A_h, A); + Kokkos::deep_copy(M_h, M); + for (int rowIdx = 0; rowIdx < A.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < A.extent_int(1); ++colIdx) { + if (tol < Kokkos::abs(A_h(rowIdx, colIdx))) { + EXPECT_NEAR_KK_REL(A_h(rowIdx, colIdx), M_h(rowIdx, colIdx), tol); + } else { + 
EXPECT_NEAR_KK(A_h(rowIdx, colIdx), M_h(rowIdx, colIdx), tol); + } + } + } +} + +template +void check_unitary_orthogonal_matrix( + const Matrix& M, typename Kokkos::ArithTraits< + typename Matrix::non_const_value_type>::mag_type tol) { + // After a successful SVD decomposition the matrices + // U and V are unitary matrices. Thus we can check + // the property UUt=UtU=I and VVt=VtV=I using gemm. + using scalar_type = typename Matrix::non_const_value_type; + + Matrix I0("M*Mt", M.extent(0), M.extent(0)); + KokkosBlas::gemm("N", "C", 1, M, M, 0, I0); + typename Matrix::HostMirror I0_h = Kokkos::create_mirror_view(I0); + Kokkos::deep_copy(I0_h, I0); + for (int rowIdx = 0; rowIdx < M.extent_int(0); ++rowIdx) { + for (int colIdx = 0; colIdx < M.extent_int(0); ++colIdx) { + if (rowIdx == colIdx) { + EXPECT_NEAR_KK_REL(I0_h(rowIdx, colIdx), + Kokkos::ArithTraits::one(), tol); + } else { + EXPECT_NEAR_KK(I0_h(rowIdx, colIdx), + Kokkos::ArithTraits::zero(), tol); + } + } + } + + Matrix I1("Mt*M", M.extent(1), M.extent(1)); + KokkosBlas::gemm("C", "N", 1, M, M, 0, I1); + typename Matrix::HostMirror I1_h = Kokkos::create_mirror_view(I1); + Kokkos::deep_copy(I1_h, I1); + for (int rowIdx = 0; rowIdx < M.extent_int(1); ++rowIdx) { + for (int colIdx = 0; colIdx < M.extent_int(1); ++colIdx) { + if (rowIdx == colIdx) { + EXPECT_NEAR_KK_REL(I1_h(rowIdx, colIdx), + Kokkos::ArithTraits::one(), tol); + } else { + EXPECT_NEAR_KK(I1_h(rowIdx, colIdx), + Kokkos::ArithTraits::zero(), tol); + } + } + } +} + +template +int impl_analytic_2x2_svd() { + using scalar_type = typename AMatrix::value_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; + using vector_type = + Kokkos::View; + using KAT_S = Kokkos::ArithTraits; + + const mag_type eps = KAT_S::eps(); + + AMatrix A("A", 2, 2), U("U", 2, 2), Vt("Vt", 2, 2), Aref("A ref", 2, 2); + vector_type S("S", 2); + + typename AMatrix::HostMirror A_h = Kokkos::create_mirror_view(A); + + // A = [3 0] + // [4 5] + // USV = 1/sqrt(10) [1 
-3] * sqrt(5) [3 0] * 1/sqrt(2) [ 1 1] + // [3 1] [0 1] [-1 1] + A_h(0, 0) = 3; + A_h(1, 0) = 4; + A_h(1, 1) = 5; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(Aref, A_h); + + KokkosLapack::svd("A", "A", A, S, U, Vt); + // Don't really need to fence here as we deep_copy right after... + + typename vector_type::HostMirror S_h = Kokkos::create_mirror_view(S); + Kokkos::deep_copy(S_h, S); + typename AMatrix::HostMirror U_h = Kokkos::create_mirror_view(U); + Kokkos::deep_copy(U_h, U); + typename AMatrix::HostMirror Vt_h = Kokkos::create_mirror_view(Vt); + Kokkos::deep_copy(Vt_h, Vt); + + // The singular values for this problem + // are known: sqrt(45) and sqrt(5) + EXPECT_NEAR_KK_REL(S_h(0), static_cast(Kokkos::sqrt(45)), + 100 * eps); + EXPECT_NEAR_KK_REL(S_h(1), static_cast(Kokkos::sqrt(5)), 100 * eps); + + // The singular vectors should be identical + // or of oposite sign we check the first + // component of the vectors to determine + // the proper signed comparison. + std::vector Uref = { + static_cast(1 / Kokkos::sqrt(10)), + static_cast(3 / Kokkos::sqrt(10)), + static_cast(-3 / Kokkos::sqrt(10)), + static_cast(1 / Kokkos::sqrt(10))}; + std::vector Vtref = { + static_cast(1 / Kokkos::sqrt(2)), + static_cast(-1 / Kokkos::sqrt(2)), + static_cast(1 / Kokkos::sqrt(2)), + static_cast(1 / Kokkos::sqrt(2))}; + + // Both rotations and reflections are valid + // vector basis so we need to check both signs + // to confirm proper SVD was achieved. 
+ Kokkos::View U_real("U real", 2, 2), + Vt_real("Vt real", 2, 2); + if constexpr (KAT_S::is_complex) { + U_real(0, 0) = U_h(0, 0).real(); + U_real(0, 1) = U_h(0, 1).real(); + U_real(1, 0) = U_h(1, 0).real(); + U_real(1, 1) = U_h(1, 1).real(); + + Vt_real(0, 0) = Vt_h(0, 0).real(); + Vt_real(0, 1) = Vt_h(0, 1).real(); + Vt_real(1, 0) = Vt_h(1, 0).real(); + Vt_real(1, 1) = Vt_h(1, 1).real(); + } else { + U_real(0, 0) = U_h(0, 0); + U_real(0, 1) = U_h(0, 1); + U_real(1, 0) = U_h(1, 0); + U_real(1, 1) = U_h(1, 1); + + Vt_real(0, 0) = Vt_h(0, 0); + Vt_real(0, 1) = Vt_h(0, 1); + Vt_real(1, 0) = Vt_h(1, 0); + Vt_real(1, 1) = Vt_h(1, 1); + } + + const mag_type tol = 100 * KAT_S::eps(); + const mag_type one_sqrt10 = static_cast(1 / Kokkos::sqrt(10)); + const mag_type one_sqrt2 = static_cast(1 / Kokkos::sqrt(2)); + + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 0)), one_sqrt10, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 1)), 3 * one_sqrt10, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 0)), 3 * one_sqrt10, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 1)), one_sqrt10, tol); + + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 1)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 1)), one_sqrt2, tol); + + check_unitary_orthogonal_matrix(U, tol); + check_unitary_orthogonal_matrix(Vt, tol); + + check_triple_product(Aref, S, U, Vt, tol); + + return 0; +} + +template +int impl_analytic_2x3_svd() { + using scalar_type = typename AMatrix::value_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; + using vector_type = + Kokkos::View; + using KAT_S = Kokkos::ArithTraits; + + const mag_type tol = 100 * KAT_S::eps(); + + AMatrix A("A", 2, 3), U("U", 2, 2), Vt("Vt", 3, 3), Aref("A ref", 2, 3); + vector_type S("S", 2); + + typename AMatrix::HostMirror A_h = Kokkos::create_mirror_view(A); + + // A = [3 2 2] + // [2 3 -2] + // USVt = 
1/sqrt(2) [1 1] * [5 0 0] * 1/(3*sqrt(2)) [ 3 3 0] + // [1 -1] [0 3 0] [ 1 -1 4] + // [2*sqrt(2) -2*sqrt(2) + // -sqrt(2)] + A_h(0, 0) = 3; + A_h(0, 1) = 2; + A_h(0, 2) = 2; + A_h(1, 0) = 2; + A_h(1, 1) = 3; + A_h(1, 2) = -2; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(Aref, A_h); + + try { + KokkosLapack::svd("A", "A", A, S, U, Vt); + } catch (const std::runtime_error& e) { + std::string test_string = e.what(); + std::string cusolver_m_less_than_n = + "CUSOLVER does not support SVD for matrices with more columns " + "than rows, you can transpose you matrix first then compute " + "SVD of that transpose: At=VSUt, and swap the output U and Vt" + " and transpose them to recover the desired SVD."; + + if (test_string == cusolver_m_less_than_n) { + return 0; + } + } + // Don't really need to fence here as we deep_copy right after... + + typename vector_type::HostMirror S_h = Kokkos::create_mirror_view(S); + Kokkos::deep_copy(S_h, S); + typename AMatrix::HostMirror U_h = Kokkos::create_mirror_view(U); + Kokkos::deep_copy(U_h, U); + typename AMatrix::HostMirror Vt_h = Kokkos::create_mirror_view(Vt); + Kokkos::deep_copy(Vt_h, Vt); + + // The singular values for this problem + // are known: sqrt(45) and sqrt(5) + EXPECT_NEAR_KK_REL(S_h(0), static_cast(5), tol); + EXPECT_NEAR_KK_REL(S_h(1), static_cast(3), tol); + + // Both rotations and reflections are valid + // vector basis so we need to check both signs + // to confirm proper SVD was achieved. 
+ Kokkos::View U_real("U real", 2, 2), + Vt_real("Vt real", 3, 3); + if constexpr (KAT_S::is_complex) { + U_real(0, 0) = U_h(0, 0).real(); + U_real(0, 1) = U_h(0, 1).real(); + U_real(1, 0) = U_h(1, 0).real(); + U_real(1, 1) = U_h(1, 1).real(); + + Vt_real(0, 0) = Vt_h(0, 0).real(); + Vt_real(0, 1) = Vt_h(0, 1).real(); + Vt_real(0, 2) = Vt_h(0, 2).real(); + Vt_real(1, 0) = Vt_h(1, 0).real(); + Vt_real(1, 1) = Vt_h(1, 1).real(); + Vt_real(1, 2) = Vt_h(1, 2).real(); + Vt_real(2, 0) = Vt_h(2, 0).real(); + Vt_real(2, 1) = Vt_h(2, 1).real(); + Vt_real(2, 2) = Vt_h(2, 2).real(); + } else { + U_real(0, 0) = U_h(0, 0); + U_real(0, 1) = U_h(0, 1); + U_real(1, 0) = U_h(1, 0); + U_real(1, 1) = U_h(1, 1); + + Vt_real(0, 0) = Vt_h(0, 0); + Vt_real(0, 1) = Vt_h(0, 1); + Vt_real(0, 2) = Vt_h(0, 2); + Vt_real(1, 0) = Vt_h(1, 0); + Vt_real(1, 1) = Vt_h(1, 1); + Vt_real(1, 2) = Vt_h(1, 2); + Vt_real(2, 0) = Vt_h(2, 0); + Vt_real(2, 1) = Vt_h(2, 1); + Vt_real(2, 2) = Vt_h(2, 2); + } + + const mag_type one_sqrt2 = static_cast(1 / Kokkos::sqrt(2)); + const mag_type one_sqrt18 = static_cast(1 / Kokkos::sqrt(18)); + const mag_type one_third = static_cast(1. 
/ 3.); + + // Check values of U + // Don't worry about the sign + // it will be check with the + // triple product + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 1)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 1)), one_sqrt2, tol); + + // Check values of Vt + // Don't worry about the sign + // it will be check with the + // triple product + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 1)), one_sqrt2, tol); + EXPECT_NEAR_KK(Kokkos::abs(Vt_real(0, 2)), 0, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 0)), one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 1)), one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 2)), 4 * one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(2, 0)), 2 * one_third, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(2, 1)), 2 * one_third, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(2, 2)), one_third, tol); + + check_unitary_orthogonal_matrix(U, tol); + check_unitary_orthogonal_matrix(Vt, tol); + + check_triple_product(Aref, S, U, Vt, tol); + + return 0; +} + +template +int impl_analytic_3x2_svd() { + using scalar_type = typename AMatrix::value_type; + using mag_type = typename Kokkos::ArithTraits::mag_type; + using vector_type = + Kokkos::View; + using KAT_S = Kokkos::ArithTraits; + + const mag_type tol = 100 * KAT_S::eps(); + + AMatrix A("A", 3, 2), U("U", 3, 3), Vt("Vt", 2, 2), Aref("A ref", 3, 2); + vector_type S("S", 2); + + typename AMatrix::HostMirror A_h = Kokkos::create_mirror_view(A); + + // Note this is simply the transpose of the 2x3 matrix in the test above + // A = [3 2] + // [2 3] + // [2 -2] + // USVt = 1/(3*sqrt(2)) [3 1 2*sqrt(2)] * [5 0] * 1/sqrt(2) [1 1] + // [3 -1 -2*sqrt(2)] [0 3] [1 -1] + // [0 4 sqrt(2)] [0 0] + A_h(0, 0) = 3; + A_h(0, 1) = 2; + A_h(1, 0) = 2; + A_h(1, 1) = 3; + 
A_h(2, 0) = 2; + A_h(2, 1) = -2; + + Kokkos::deep_copy(A, A_h); + Kokkos::deep_copy(Aref, A_h); + + KokkosLapack::svd("A", "A", A, S, U, Vt); + // Don't really need to fence here as we deep_copy right after... + + typename vector_type::HostMirror S_h = Kokkos::create_mirror_view(S); + Kokkos::deep_copy(S_h, S); + typename AMatrix::HostMirror U_h = Kokkos::create_mirror_view(U); + Kokkos::deep_copy(U_h, U); + typename AMatrix::HostMirror Vt_h = Kokkos::create_mirror_view(Vt); + Kokkos::deep_copy(Vt_h, Vt); + + // The singular values for this problem + // are known: sqrt(45) and sqrt(5) + EXPECT_NEAR_KK_REL(S_h(0), static_cast(5), tol); + EXPECT_NEAR_KK_REL(S_h(1), static_cast(3), tol); + + // Both rotations and reflections are valid + // vector basis so we need to check both signs + // to confirm proper SVD was achieved. + Kokkos::View U_real("U real", 3, 3), + Vt_real("Vt real", 2, 2); + if constexpr (KAT_S::is_complex) { + U_real(0, 0) = U_h(0, 0).real(); + U_real(0, 1) = U_h(0, 1).real(); + U_real(0, 2) = U_h(0, 2).real(); + U_real(1, 0) = U_h(1, 0).real(); + U_real(1, 1) = U_h(1, 1).real(); + U_real(1, 2) = U_h(1, 2).real(); + U_real(2, 0) = U_h(2, 0).real(); + U_real(2, 1) = U_h(2, 1).real(); + U_real(2, 2) = U_h(2, 2).real(); + + Vt_real(0, 0) = Vt_h(0, 0).real(); + Vt_real(0, 1) = Vt_h(0, 1).real(); + Vt_real(1, 0) = Vt_h(1, 0).real(); + Vt_real(1, 1) = Vt_h(1, 1).real(); + } else { + U_real(0, 0) = U_h(0, 0); + U_real(0, 1) = U_h(0, 1); + U_real(0, 2) = U_h(0, 2); + U_real(1, 0) = U_h(1, 0); + U_real(1, 1) = U_h(1, 1); + U_real(1, 2) = U_h(1, 2); + U_real(2, 0) = U_h(2, 0); + U_real(2, 1) = U_h(2, 1); + U_real(2, 2) = U_h(2, 2); + + Vt_real(0, 0) = Vt_h(0, 0); + Vt_real(0, 1) = Vt_h(0, 1); + Vt_real(1, 0) = Vt_h(1, 0); + Vt_real(1, 1) = Vt_h(1, 1); + } + + const mag_type one_sqrt2 = static_cast(1 / Kokkos::sqrt(2)); + const mag_type one_sqrt18 = static_cast(1 / Kokkos::sqrt(18)); + const mag_type one_third = static_cast(1. 
/ 3.); + + // Check values of U + // Don't worry about the sign + // it will be check with the + // triple product + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 1)), one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(0, 2)), 2 * one_third, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 1)), one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(1, 2)), 2 * one_third, tol); + EXPECT_NEAR_KK(Kokkos::abs(U_real(2, 0)), 0, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(2, 1)), 4 * one_sqrt18, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(U_real(2, 2)), one_third, tol); + + // Check values of Vt + // Don't worry about the sign + // it will be check with the + // triple product + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(0, 1)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 0)), one_sqrt2, tol); + EXPECT_NEAR_KK_REL(Kokkos::abs(Vt_real(1, 1)), one_sqrt2, tol); + + check_unitary_orthogonal_matrix(U, tol); + check_unitary_orthogonal_matrix(Vt, tol); + + check_triple_product(Aref, S, U, Vt, tol); + + return 0; +} + +template +int impl_test_svd(const int m, const int n) { + using execution_space = typename Device::execution_space; + using scalar_type = typename AMatrix::value_type; + using KAT_S = Kokkos::ArithTraits; + using mag_type = typename KAT_S::mag_type; + using vector_type = + Kokkos::View; + + const mag_type max_val = 10; + const mag_type tol = 2000 * max_val * KAT_S::eps(); + + AMatrix A("A", m, n), U("U", m, m), Vt("Vt", n, n), Aref("A ref", m, n); + vector_type S("S", Kokkos::min(m, n)); + + const uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + + // Initialize A with random numbers + scalar_type randStart = 0, randEnd = 0; + Test::getRandomBounds(max_val, randStart, 
randEnd); + Kokkos::fill_random(A, rand_pool, randStart, randEnd); + Kokkos::deep_copy(Aref, A); + + // Working around CUSOLVER constraint for m >= n +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + if constexpr (std::is_same_v) { + if (m >= n) { + KokkosLapack::svd("A", "A", A, S, U, Vt); + } else { + return 0; + } + } else { + KokkosLapack::svd("A", "A", A, S, U, Vt); + } +#else + KokkosLapack::svd("A", "A", A, S, U, Vt); +#endif + + check_unitary_orthogonal_matrix(U, tol); + check_unitary_orthogonal_matrix(Vt, tol); + + // For larger sizes with the triple product + // we accumulate a bit more error apparently? + check_triple_product(Aref, S, U, Vt, 100 * Kokkos::max(m, n) * tol); + + return 0; +} + +} // namespace Test + +template +int test_svd() { + int ret; + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_left = + Kokkos::View; + + ret = Test::impl_analytic_2x2_svd(); + EXPECT_EQ(ret, 0); + + ret = Test::impl_analytic_2x3_svd(); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(0, 0); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(1, 1); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(15, 15); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(100, 100); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(100, 70); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(70, 100); + EXPECT_EQ(ret, 0); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + using view_type_a_layout_right = + Kokkos::View; + + ret = Test::impl_analytic_2x2_svd(); + EXPECT_EQ(ret, 0); + + ret = Test::impl_analytic_2x3_svd(); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(0, 0); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(1, 1); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(15, 15); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(100, 100); + 
EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(100, 70); + EXPECT_EQ(ret, 0); + + ret = Test::impl_test_svd(70, 100); + EXPECT_EQ(ret, 0); +#endif + + return 1; +} + +template +int test_svd_wrapper() { +#if defined(KOKKOSKERNELS_ENABLE_TPL_LAPACK) || \ + defined(KOKKOSKERNELS_ENABLE_TPL_MKL) + if constexpr (std::is_same_v) { + // Using a device side space with LAPACK/MKL + return test_svd(); + } +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSOLVER) + if constexpr (std::is_same_v) { + // Using a Cuda device with CUSOLVER + return test_svd(); + } +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSOLVER) + if constexpr (std::is_same_v) { + // Using a HIP device with ROCSOLVER + return test_svd(); + } +#endif + + std::cout << "No TPL support enabled, svd is not tested" << std::endl; + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, svd_float) { + Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_float"); + test_svd_wrapper(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, svd_double) { + Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_double"); + test_svd_wrapper(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, svd_complex_float) { + Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_complex_float"); + test_svd_wrapper, TestDevice>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, svd_complex_double) { + 
Kokkos::Profiling::pushRegion("KokkosLapack::Test::svd_complex_double"); + test_svd_wrapper, TestDevice>(); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/packages/kokkos-kernels/master_history.txt b/packages/kokkos-kernels/master_history.txt index 26f95694e9c4..2207bca133c8 100644 --- a/packages/kokkos-kernels/master_history.txt +++ b/packages/kokkos-kernels/master_history.txt @@ -8,7 +8,7 @@ tag: 2.8.00 date: 02/05/2019 master: a6e05e06 develop: 6a790321 tag: 2.9.00 date: 06/24/2019 master: 4ee5f3c6 develop: 094da30c tag: 3.0.00 date: 01/31/2020 master: d86db111 release-candidate-3.0: cf24ab90 tag: 3.1.00 date: 04/14/2020 master: f199f45d develop: 8d063eae -tag: 3.1.01 date: 05/04/2020 master: 43773523 release: 6fce7502 +tag: 3.1.01 date: 05/04/2020 master: 43773523 release: 6fce7502 tag: 3.2.00 date: 08/19/2020 master: 07a60bcc release: ea3f2b77 tag: 3.3.00 date: 12/16/2020 master: 42defc56 release: e5279e55 tag: 3.3.01 date: 01/18/2021 master: f64b1c57 release: 4e1cc00b @@ -24,3 +24,4 @@ tag: 4.0.01 date: 04/26/2023 master: b9c1bab7 release: 8809e41c tag: 4.1.00 date: 06/20/2023 master: 1331baf1 release: 14ad220a tag: 4.2.00 date: 11/09/2023 master: 25a31f88 release: 912d3778 tag: 4.2.01 date: 01/30/2024 master: f429f6ec release: bcf9854b +tag: 4.3.00 date: 04/03/2024 master: afd65f03 release: ebbf4b78 diff --git a/packages/kokkos-kernels/ode/impl/KokkosODE_BDF_impl.hpp b/packages/kokkos-kernels/ode/impl/KokkosODE_BDF_impl.hpp new file mode 100644 index 000000000000..cf89731f1b1f --- /dev/null +++ b/packages/kokkos-kernels/ode/impl/KokkosODE_BDF_impl.hpp @@ -0,0 +1,532 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSBLAS_BDF_IMPL_HPP +#define KOKKOSBLAS_BDF_IMPL_HPP + +#include "Kokkos_Core.hpp" + +#include "KokkosODE_Newton.hpp" +#include "KokkosBlas2_serial_gemv.hpp" +#include "KokkosBatched_Gemm_Decl.hpp" + +namespace KokkosODE { +namespace Impl { + +template +struct BDF_table {}; + +template <> +struct BDF_table<1> { + static constexpr int order = 1; + Kokkos::Array coefficients{{-1.0, 1.0}}; +}; + +template <> +struct BDF_table<2> { + static constexpr int order = 2; + Kokkos::Array coefficients{{-4.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0}}; +}; + +template <> +struct BDF_table<3> { + static constexpr int order = 3; + Kokkos::Array coefficients{ + {-18.0 / 11.0, 9.0 / 11.0, -2.0 / 11.0, 6.0 / 11.0}}; +}; + +template <> +struct BDF_table<4> { + static constexpr int order = 4; + Kokkos::Array coefficients{ + {-48.0 / 25.0, 36.0 / 25.0, -16.0 / 25.0, 3.0 / 25.0, 12.0 / 25.0}}; +}; + +template <> +struct BDF_table<5> { + static constexpr int order = 5; + Kokkos::Array coefficients{{-300.0 / 137.0, 300.0 / 137.0, + -200.0 / 137.0, 75.0 / 137.0, + -12.0 / 137.0, 60.0 / 137.0}}; +}; + +template <> +struct BDF_table<6> { + static constexpr int order = 6; + Kokkos::Array coefficients{ + {-360.0 / 147.0, 450.0 / 147.0, -400.0 / 147.0, 225.0 / 147.0, + -72.0 / 147.0, 10.0 / 147.0, 60.0 / 147.0}}; +}; + +template +struct BDF_system_wrapper { + const system_type mySys; + const int neqs; + const table_type table; + const int order = table.order; + + double t, dt; + mv_type yn; + + KOKKOS_FUNCTION + BDF_system_wrapper(const system_type& mySys_, const table_type& table_, + const double t_, const double dt_, const mv_type& yn_) + : mySys(mySys_), + neqs(mySys_.neqs), + table(table_), + t(t_), + dt(dt_), + yn(yn_) {} + + template + KOKKOS_FUNCTION void residual(const vec_type& 
y, const vec_type& f) const { + // f = f(t+dt, y) + mySys.evaluate_function(t, dt, y, f); + + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + f(eqIdx) = y(eqIdx) - table.coefficients[order] * dt * f(eqIdx); + for (int orderIdx = 0; orderIdx < order; ++orderIdx) { + f(eqIdx) += + table.coefficients[order - 1 - orderIdx] * yn(eqIdx, orderIdx); + } + } + } + + template + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + mySys.evaluate_jacobian(t, dt, y, jac); + + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = + -table.coefficients[order] * dt * jac(rowIdx, colIdx); + } + jac(rowIdx, rowIdx) += 1.0; + } + } +}; + +template +struct BDF_system_wrapper2 { + const system_type mySys; + const int neqs; + const subview_type psi; + const d_vec_type d; + + bool compute_jac = true; + double t, dt, c = 0; + + KOKKOS_FUNCTION + BDF_system_wrapper2(const system_type& mySys_, const subview_type& psi_, + const d_vec_type& d_, const double t_, const double dt_) + : mySys(mySys_), neqs(mySys_.neqs), psi(psi_), d(d_), t(t_), dt(dt_) {} + + template + KOKKOS_FUNCTION void residual(const YVectorType& y, + const FVectorType& f) const { + // f = f(t+dt, y) + mySys.evaluate_function(t, dt, y, f); + + // std::cout << "f = psi + d - c * f = " << psi(0) << " + " << d(0) << " - " + // << c << " * " << f(0) << std::endl; + + // rhs = higher order terms + y_{n+1}^i - y_n - dt*f + for (int eqIdx = 0; eqIdx < neqs; ++eqIdx) { + f(eqIdx) = psi(eqIdx) + d(eqIdx) - c * f(eqIdx); + } + } + + template + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + if (compute_jac) { + mySys.evaluate_jacobian(t, dt, y, jac); + + // J = I - dt*(dy/dy) + for (int rowIdx = 0; rowIdx < neqs; ++rowIdx) { + for (int colIdx = 0; colIdx < neqs; ++colIdx) { + jac(rowIdx, colIdx) = -dt * jac(rowIdx, colIdx); + } + jac(rowIdx, rowIdx) += 1.0; + } + } + } +}; + +template +KOKKOS_FUNCTION void 
BDFStep(ode_type& ode, const table_type& table, + scalar_type t, scalar_type dt, + const vec_type& y_old, const vec_type& y_new, + const vec_type& rhs, const vec_type& update, + const vec_type& scale, const mv_type& y_vecs, + const mat_type& temp, const mat_type& jac) { + using newton_params = KokkosODE::Experimental::Newton_params; + + BDF_system_wrapper sys(ode, table, t, dt, y_vecs); + const newton_params param(50, 1e-14, 1e-12); + + // first set y_new = y_old + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + y_new(eqIdx) = y_old(eqIdx); + } + + // solver the nonlinear problem + { + KokkosODE::Experimental::Newton::Solve(sys, param, jac, temp, y_new, rhs, + update, scale); + } + +} // BDFStep + +template +KOKKOS_FUNCTION void compute_coeffs(const int order, const scalar_type factor, + const mat_type& coeffs) { + coeffs(0, 0) = 1.0; + for (int colIdx = 0; colIdx < order; ++colIdx) { + coeffs(0, colIdx + 1) = 1.0; + for (int rowIdx = 0; rowIdx < order; ++rowIdx) { + coeffs(rowIdx + 1, colIdx + 1) = + ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * + coeffs(rowIdx, colIdx + 1); + } + } +} + +template +KOKKOS_FUNCTION void update_D(const int order, const scalar_type factor, + const mat_type& coeffs, const mat_type& tempD, + const mat_type& D) { + auto subD = + Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(0, order + 1)); + auto subTempD = Kokkos::subview(tempD, Kokkos::ALL(), + Kokkos::pair(0, order + 1)); + + compute_coeffs(order, factor, coeffs); + auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), + Kokkos::pair(0, order + 1)); + KokkosBatched::SerialGemm< + KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, subD, R, 0.0, subTempD); + + compute_coeffs(order, 1.0, coeffs); + auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), + Kokkos::pair(0, order + 1)); + KokkosBatched::SerialGemm< + KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, + 
KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, subTempD, U, 0.0, subD); +} + +template +KOKKOS_FUNCTION void initial_step_size( + const ode_type ode, const int order, const scalar_type t0, + const scalar_type atol, const scalar_type rtol, const vec_type& y0, + const res_type& f0, const mat_type& temp, scalar_type& dt_ini) { + using KAT = Kokkos::ArithTraits; + + // Extract subviews to store intermediate data + auto scale = Kokkos::subview(temp, Kokkos::ALL(), 1); + auto y1 = Kokkos::subview(temp, Kokkos::ALL(), 2); + auto f1 = Kokkos::subview(temp, Kokkos::ALL(), 3); + + // Compute norms for y0 and f0 + double n0 = KAT::zero(), n1 = KAT::zero(), dt0; + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + scale(eqIdx) = atol + rtol * Kokkos::abs(y0(eqIdx)); + n0 += Kokkos::pow(y0(eqIdx) / scale(eqIdx), 2); + n1 += Kokkos::pow(f0(eqIdx) / scale(eqIdx), 2); + } + n0 = Kokkos::sqrt(n0) / Kokkos::sqrt(ode.neqs); + n1 = Kokkos::sqrt(n1) / Kokkos::sqrt(ode.neqs); + + // Select dt0 + if ((n0 < 1e-5) || (n1 < 1e-5)) { + dt0 = 1e-6; + } else { + dt0 = 0.01 * n0 / n1; + } + + // Estimate y at t0 + dt0 + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y1(eqIdx) = y0(eqIdx) + dt0 * f0(eqIdx); + } + + // Compute f at t0+dt0 and y1, + // then compute the norm of f(t0+dt0, y1) - f(t0, y0) + scalar_type n2 = KAT::zero(); + ode.evaluate_function(t0 + dt0, dt0, y1, f1); + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + n2 += Kokkos::pow((f1(eqIdx) - f0(eqIdx)) / scale(eqIdx), 2); + } + n2 = Kokkos::sqrt(n2) / (dt0 * Kokkos::sqrt(ode.neqs)); + + // Finally select initial time step dt_ini + if ((n1 <= 1e-15) && (n2 <= 1e-15)) { + dt_ini = Kokkos::max(1e-6, dt0 * 1e-3); + } else { + dt_ini = Kokkos::pow(0.01 / Kokkos::max(n1, n2), KAT::one() / (order + 1)); + } + + dt_ini = Kokkos::min(100 * dt0, dt_ini); + + // Zero out temp variables just to be safe... 
+ for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + scale(eqIdx) = 0; + y1(eqIdx) = 0; + f1(eqIdx) = 0; + } +} // initial_step_size + +template +KOKKOS_FUNCTION void BDFStep(ode_type& ode, scalar_type& t, scalar_type& dt, + scalar_type t_end, int& order, + int& num_equal_steps, const int max_newton_iters, + const scalar_type atol, const scalar_type rtol, + const scalar_type min_factor, + const vec_type& y_old, const vec_type& y_new, + const res_type& rhs, const res_type& update, + const mat_type& temp, const mat_type& temp2) { + using newton_params = KokkosODE::Experimental::Newton_params; + + constexpr int max_order = 5; + + // For NDF coefficients see Sahmpine and Reichelt, The Matlab ODE suite, SIAM + // SISCm 18, 1, p1-22, January 1997 Kokkos::Array kappa{{0., + // -0.1850, -1/9 , -0.0823000, -0.0415000, 0.}}; // NDF coefficients + // kappa gamma(i) = sum_{k=1}^i(1.0 / k); gamma(0) = 0; // NDF coefficients + // gamma_k alpha(i) = (1 - kappa(i)) * gamma(i) error_const(i) = kappa(i) * + // gamma(i) + 1 / (i + 1) + const Kokkos::Array alpha{ + {0., 1.185, 1.66666667, 1.98421667, 2.16979167, 2.28333333}}; + const Kokkos::Array error_const{ + {1., 0.315, 0.16666667, 0.09911667, 0.11354167, 0.16666667}}; + + // Extract columns of temp to form temporary + // subviews to operate on. 
+ // const int numRows = temp.extent_int(0); const int numCols = + // temp.extent_int(1); std::cout << "numRows: " << numRows << ", numCols: " << + // numCols << std::endl; std::cout << "Extract subview from temp" << + // std::endl; + int offset = 2; + auto D = Kokkos::subview( + temp, Kokkos::ALL(), + Kokkos::pair(offset, offset + 8)); // y and its derivatives + offset += 8; + auto tempD = Kokkos::subview(temp, Kokkos::ALL(), + Kokkos::pair(offset, offset + 8)); + offset += 8; + auto scale = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); + ++offset; // Scaling coefficients for error calculation + auto y_predict = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); + ++offset; // Initial guess for y_{n+1} + auto psi = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); + ++offset; // Higher order terms contribution to rhs + auto error = Kokkos::subview(temp, Kokkos::ALL(), offset + 1); + ++offset; // Error estimate + auto jac = Kokkos::subview( + temp, Kokkos::ALL(), + Kokkos::pair(offset, offset + ode.neqs)); // Jacobian matrix + offset += ode.neqs; + auto tmp_gesv = Kokkos::subview( + temp, Kokkos::ALL(), + Kokkos::pair( + offset, offset + ode.neqs + 4)); // Buffer space for gesv calculation + offset += ode.neqs + 4; + + auto coeffs = + Kokkos::subview(temp2, Kokkos::ALL(), Kokkos::pair(0, 6)); + auto gamma = Kokkos::subview(temp2, Kokkos::ALL(), 6); + gamma(0) = 0.0; + gamma(1) = 1.0; + gamma(2) = 1.5; + gamma(3) = 1.83333333; + gamma(4) = 2.08333333; + gamma(5) = 2.28333333; + + BDF_system_wrapper2 sys(ode, psi, update, t, dt); + const newton_params param( + max_newton_iters, atol, + Kokkos::max(10 * Kokkos::ArithTraits::eps() / rtol, + Kokkos::min(0.03, Kokkos::sqrt(rtol)))); + + scalar_type max_step = Kokkos::ArithTraits::max(); + scalar_type min_step = Kokkos::ArithTraits::min(); + scalar_type safety = 0.675, error_norm; + if (dt > max_step) { + update_D(order, max_step / dt, coeffs, tempD, D); + dt = max_step; + num_equal_steps = 0; + } else if (dt < 
min_step) { + update_D(order, min_step / dt, coeffs, tempD, D); + dt = min_step; + num_equal_steps = 0; + } + + // first set y_new = y_old + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + y_new(eqIdx) = y_old(eqIdx); + } + + double t_new = 0; + bool step_accepted = false; + while (!step_accepted) { + if (dt < min_step) { + return; + } + t_new = t + dt; + + if (t_new > t_end) { + t_new = t_end; + update_D(order, (t_new - t) / dt, coeffs, tempD, D); + num_equal_steps = 0; + } + dt = t_new - t; + + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + y_predict(eqIdx) = 0; + for (int orderIdx = 0; orderIdx < order + 1; ++orderIdx) { + y_predict(eqIdx) += D(eqIdx, orderIdx); + } + scale(eqIdx) = atol + rtol * Kokkos::abs(y_predict(eqIdx)); + } + + // Compute psi, the sum of the higher order + // contribution to the residual + auto subD = + Kokkos::subview(D, Kokkos::ALL(), Kokkos::pair(1, order + 1)); + auto subGamma = + Kokkos::subview(gamma, Kokkos::pair(1, order + 1)); + KokkosBlas::Experimental::serial_gemv('N', 1.0 / alpha[order], subD, + subGamma, 0.0, psi); + + sys.compute_jac = true; + sys.c = dt / alpha[order]; + sys.jacobian(y_new, jac); + sys.compute_jac = true; + Kokkos::Experimental::local_deep_copy(y_new, y_predict); + Kokkos::Experimental::local_deep_copy(update, 0); + KokkosODE::Experimental::newton_solver_status newton_status = + KokkosODE::Experimental::Newton::Solve(sys, param, jac, tmp_gesv, y_new, + rhs, update, scale); + + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + update(eqIdx) = y_new(eqIdx) - y_predict(eqIdx); + } + + if (newton_status == + KokkosODE::Experimental::newton_solver_status::MAX_ITER) { + dt = 0.5 * dt; + update_D(order, 0.5, coeffs, tempD, D); + num_equal_steps = 0; + + } else { + // Estimate the solution error + safety = 0.9 * (2 * max_newton_iters + 1) / + (2 * max_newton_iters + param.iters); + error_norm = 0; + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + scale(eqIdx) = atol + rtol * Kokkos::abs(y_new(eqIdx)); + 
error(eqIdx) = error_const[order] * update(eqIdx) / scale(eqIdx); + error_norm += error(eqIdx) * error(eqIdx); + } + error_norm = Kokkos::sqrt(error_norm) / Kokkos::sqrt(sys.neqs); + + // Check error norm and adapt step size or accept step + if (error_norm > 1) { + scalar_type factor = Kokkos::max( + min_factor, safety * Kokkos::pow(error_norm, -1.0 / (order + 1))); + dt = factor * dt; + update_D(order, factor, coeffs, tempD, D); + num_equal_steps = 0; + } else { + step_accepted = true; + } + } + } // while(!step_accepted) + + // Now that our time step has been + // accepted we update all our states + // and see if we can adapt the order + // or the time step before going to + // the next step. + ++num_equal_steps; + t = t_new; + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + D(eqIdx, order + 2) = update(eqIdx) - D(eqIdx, order + 1); + D(eqIdx, order + 1) = update(eqIdx); + for (int orderIdx = order; 0 <= orderIdx; --orderIdx) { + D(eqIdx, orderIdx) += D(eqIdx, orderIdx + 1); + } + } + + // Not enough steps at constant dt + // have been succesfull so we do not + // attempt order adaptation. 
+ double error_low = 0, error_high = 0; + if (num_equal_steps < order + 1) { + return; + } + + if (1 < order) { + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + error_low += Kokkos::pow( + error_const[order - 1] * D(eqIdx, order) / scale(eqIdx), 2); + } + error_low = Kokkos::sqrt(error_low) / Kokkos::sqrt(sys.neqs); + } else { + error_low = Kokkos::ArithTraits::max(); + } + + if (order < max_order) { + for (int eqIdx = 0; eqIdx < sys.neqs; ++eqIdx) { + error_high += Kokkos::pow( + error_const[order + 1] * D(eqIdx, order + 2) / scale(eqIdx), 2); + } + error_high = Kokkos::sqrt(error_high) / Kokkos::sqrt(sys.neqs); + } else { + error_high = Kokkos::ArithTraits::max(); + } + + double factor_low, factor_mid, factor_high, factor; + factor_low = Kokkos::pow(error_low, -1.0 / order); + factor_mid = Kokkos::pow(error_norm, -1.0 / (order + 1)); + factor_high = Kokkos::pow(error_high, -1.0 / (order + 2)); + + int delta_order = 0; + if ((factor_mid < factor_low) && (factor_high < factor_low)) { + delta_order = -1; + factor = factor_low; + } else if ((factor_low < factor_high) && (factor_mid < factor_high)) { + delta_order = 1; + factor = factor_high; + } else { + delta_order = 0; + factor = factor_mid; + } + order += delta_order; + factor = Kokkos::fmin(10, safety * factor); + dt *= factor; + + update_D(order, factor, coeffs, tempD, D); + num_equal_steps = 0; + +} // BDFStep + +} // namespace Impl +} // namespace KokkosODE + +#endif // KOKKOSBLAS_BDF_IMPL_HPP diff --git a/packages/kokkos-kernels/ode/impl/KokkosODE_Newton_impl.hpp b/packages/kokkos-kernels/ode/impl/KokkosODE_Newton_impl.hpp index d5000a74ab28..348bf0aa226e 100644 --- a/packages/kokkos-kernels/ode/impl/KokkosODE_Newton_impl.hpp +++ b/packages/kokkos-kernels/ode/impl/KokkosODE_Newton_impl.hpp @@ -30,18 +30,29 @@ namespace KokkosODE { namespace Impl { -template +template KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( system_type& sys, const KokkosODE::Experimental::Newton_params& 
params, - mat_type& J, mat_type& tmp, vec_type& y0, vec_type& rhs, vec_type& update) { + mat_type& J, mat_type& tmp, ini_vec_type& y0, rhs_vec_type& rhs, + update_type& update, const scale_type& scale) { using newton_solver_status = KokkosODE::Experimental::newton_solver_status; - using value_type = typename vec_type::non_const_value_type; + using value_type = typename ini_vec_type::non_const_value_type; // Define the type returned by nrm2 to store // the norm of the residual. using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< - typename vec_type::non_const_value_type>::mag_type; - norm_type norm = Kokkos::ArithTraits::zero(); + typename ini_vec_type::non_const_value_type>::mag_type; + sys.residual(y0, rhs); + const norm_type norm0 = KokkosBlas::serial_nrm2(rhs); + norm_type norm = Kokkos::ArithTraits::zero(); + norm_type norm_old = Kokkos::ArithTraits::zero(); + norm_type norm_new = Kokkos::ArithTraits::zero(); + norm_type rate = Kokkos::ArithTraits::zero(); + + const norm_type tol = + Kokkos::max(10 * Kokkos::ArithTraits::eps() / params.rel_tol, + Kokkos::min(0.03, Kokkos::sqrt(params.rel_tol))); // LBV - 07/24/2023: for now assume that we take // a full Newton step. Eventually this value can @@ -57,12 +68,6 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( // Solve the following linearized // problem at each iteration: J*update=-rhs // with J=du/dx, rhs=f(u_n+update)-f(u_n) - norm = KokkosBlas::serial_nrm2(rhs); - - if ((norm < params.rel_tol) || - (it > 0 ? 
KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { - return newton_solver_status::NLS_SUCCESS; - } // compute LHS sys.jacobian(y0, J); @@ -73,6 +78,26 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( J, update, rhs, tmp); KokkosBlas::SerialScale::invoke(-1, update); + // update solution // x = x + alpha*update + KokkosBlas::serial_axpy(alpha, update, y0); + norm = KokkosBlas::serial_nrm2(rhs); + + // Compute rms norm of the scaled update + for (int idx = 0; idx < sys.neqs; ++idx) { + norm_new = (update(idx) * update(idx)) / (scale(idx) * scale(idx)); + } + norm_new = Kokkos::sqrt(norm_new / sys.neqs); + if ((it > 0) && norm_old > Kokkos::ArithTraits::zero()) { + rate = norm_new / norm_old; + if ((rate >= 1) || + Kokkos::pow(rate, params.max_iters - it) / (1 - rate) * norm_new > + tol) { + return newton_solver_status::NLS_DIVERGENCE; + } else if ((norm_new == 0) || ((rate / (1 - rate)) * norm_new < tol)) { + return newton_solver_status::NLS_SUCCESS; + } + } + if (linSolverStat == 1) { #if KOKKOS_VERSION < 40199 KOKKOS_IMPL_DO_NOT_USE_PRINTF( @@ -83,8 +108,12 @@ KOKKOS_FUNCTION KokkosODE::Experimental::newton_solver_status NewtonSolve( return newton_solver_status::LIN_SOLVE_FAIL; } - // update solution // x = x + alpha*update - KokkosBlas::serial_axpy(alpha, update, y0); + if ((norm < (params.rel_tol * norm0)) || + (it > 0 ? KokkosBlas::serial_nrm2(update) < params.abs_tol : false)) { + return newton_solver_status::NLS_SUCCESS; + } + + norm_old = norm_new; } return newton_solver_status::MAX_ITER; } diff --git a/packages/kokkos-kernels/ode/src/KokkosODE_BDF.hpp b/packages/kokkos-kernels/ode/src/KokkosODE_BDF.hpp new file mode 100644 index 000000000000..71a450a1c680 --- /dev/null +++ b/packages/kokkos-kernels/ode/src/KokkosODE_BDF.hpp @@ -0,0 +1,227 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSODE_BDF_HPP +#define KOKKOSODE_BDF_HPP + +/// \author Luc Berger-Vergiat (lberge@sandia.gov) +/// \file KokkosODE_BDF.hpp + +#include "Kokkos_Core.hpp" +#include "KokkosODE_Types.hpp" +#include "KokkosODE_RungeKutta.hpp" + +#include "KokkosODE_BDF_impl.hpp" + +namespace KokkosODE { +namespace Experimental { + +enum BDF_type : int { + BDF1 = 0, + BDF2 = 1, + BDF3 = 2, + BDF4 = 3, + BDF5 = 4, + BDF6 = 5 +}; + +template +struct BDF_coeff_helper { + using table_type = void; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<1>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<2>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<3>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<4>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<5>; + + BDF_coeff_helper() = default; +}; + +template <> +struct BDF_coeff_helper { + using table_type = KokkosODE::Impl::BDF_table<6>; + + BDF_coeff_helper() = default; +}; + +template +struct BDF { + using table_type = typename BDF_coeff_helper::table_type; + + template + KOKKOS_FUNCTION static void Solve( + const ode_type& ode, const scalar_type t_start, const scalar_type t_end, + const int num_steps, const 
vec_type& y0, const vec_type& y, + const vec_type& rhs, const vec_type& update, const vec_type& scale, + const mv_type& y_vecs, const mv_type& kstack, const mat_type& temp, + const mat_type& jac) { + const table_type table{}; + + const double dt = (t_end - t_start) / num_steps; + double t = t_start; + + // Load y0 into y_vecs(:, 0) + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y_vecs(eqIdx, 0) = y0(eqIdx); + } + + // Compute initial start-up history vectors + // Using a non adaptive explicit method. + const int init_steps = table.order - 1; + if (num_steps < init_steps) { + return; + } + KokkosODE::Experimental::ODE_params params(table.order - 1); + for (int stepIdx = 0; stepIdx < init_steps; ++stepIdx) { + KokkosODE::Experimental::RungeKutta::Solve( + ode, params, t, t + dt, y0, y, update, kstack); + + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y_vecs(eqIdx, stepIdx + 1) = y(eqIdx); + y0(eqIdx) = y(eqIdx); + } + t += dt; + } + + for (int stepIdx = init_steps; stepIdx < num_steps; ++stepIdx) { + KokkosODE::Impl::BDFStep(ode, table, t, dt, y0, y, rhs, update, scale, + y_vecs, temp, jac); + + // Update history + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y0(eqIdx) = y(eqIdx); + for (int orderIdx = 0; orderIdx < table.order - 1; ++orderIdx) { + y_vecs(eqIdx, orderIdx) = y_vecs(eqIdx, orderIdx + 1); + } + y_vecs(eqIdx, table.order - 1) = y(eqIdx); + } + t += dt; + } + } // Solve() +}; + +/// \brief BDF Solve integrates an ordinary differential equation +/// using an order and time adaptive BDF method. +/// +/// The integration starts with a BDF1 method and adaptively increases +/// or decreases both dt and the order of integration based on error +/// estimators. This function is marked as KOKKOS_FUNCTION so it can +/// be called on host and device. 
+/// +/// \tparam ode_type the type of the ode object to integrated +/// \tparam mv_type a rank-2 view +/// \tparam vec_type a rank-1 view +/// +/// \param ode [in]: the ode to integrate +/// \param t_start [in]: time at which the integration starts +/// \param t_end [in]: time at which the integration stops +/// \param initial_step [in]: initial value for dt +/// \param max_step [in]: maximum value for dt +/// \param y0 [in/out]: vector of initial conditions, set to the solution +/// at the end of the integration +/// \param y_new [out]: vector of solution at t_end +/// \param temp [in]: vectors for temporary storage +/// \param temp2 [in]: vectors for temporary storage +template +KOKKOS_FUNCTION void BDFSolve(const ode_type& ode, const scalar_type t_start, + const scalar_type t_end, + const scalar_type initial_step, + const scalar_type max_step, const vec_type& y0, + const vec_type& y_new, mat_type& temp, + mat_type& temp2) { + using KAT = Kokkos::ArithTraits; + + // This needs to go away and be pulled out of temp instead... + auto rhs = Kokkos::subview(temp, Kokkos::ALL(), 0); + auto update = Kokkos::subview(temp, Kokkos::ALL(), 1); + // vec_type rhs("rhs", ode.neqs), update("update", ode.neqs); + (void)max_step; + + int order = 1, num_equal_steps = 0; + constexpr scalar_type min_factor = 0.2; + scalar_type dt = initial_step; + scalar_type t = t_start; + + constexpr int max_newton_iters = 10; + scalar_type atol = 1.0e-6, rtol = 1.0e-3; + + // Compute rhs = f(t_start, y0) + ode.evaluate_function(t_start, 0, y0, rhs); + + // Check if we need to compute the initial + // time step size. 
+ if (initial_step == KAT::zero()) { + KokkosODE::Impl::initial_step_size(ode, order, t_start, atol, rtol, y0, rhs, + temp, dt); + } + + // Initialize D(:, 0) = y0 and D(:, 1) = dt*rhs + auto D = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::pair(2, 10)); + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + D(eqIdx, 0) = y0(eqIdx); + D(eqIdx, 1) = dt * rhs(eqIdx); + rhs(eqIdx) = 0; + } + + // Now we loop over the time interval [t_start, t_end] + // and solve our ODE. + while (t < t_end) { + KokkosODE::Impl::BDFStep(ode, t, dt, t_end, order, num_equal_steps, + max_newton_iters, atol, rtol, min_factor, y0, + y_new, rhs, update, temp, temp2); + + for (int eqIdx = 0; eqIdx < ode.neqs; ++eqIdx) { + y0(eqIdx) = y_new(eqIdx); + } + // printf("t=%f, dt=%f, y={%f, %f, %f}\n", t, dt, y0(0), y0(1), y0(2)); + } +} // BDFSolve + +} // namespace Experimental +} // namespace KokkosODE + +#endif // KOKKOSODE_BDF_HPP diff --git a/packages/kokkos-kernels/ode/src/KokkosODE_Newton.hpp b/packages/kokkos-kernels/ode/src/KokkosODE_Newton.hpp index 94c96e2eea7c..ffccba5cd33e 100644 --- a/packages/kokkos-kernels/ode/src/KokkosODE_Newton.hpp +++ b/packages/kokkos-kernels/ode/src/KokkosODE_Newton.hpp @@ -30,12 +30,14 @@ namespace Experimental { /// \brief Newton solver for non-linear system of equations struct Newton { - template + template KOKKOS_FUNCTION static newton_solver_status Solve( const system_type& sys, const Newton_params& params, const mat_type& J, - const mat_type& tmp, const vec_type& y0, const vec_type& rhs, - const vec_type& update) { - return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update); + const mat_type& tmp, const ini_vec_type& y0, const rhs_vec_type& rhs, + const update_type& update, const scale_type& scale) { + return KokkosODE::Impl::NewtonSolve(sys, params, J, tmp, y0, rhs, update, + scale); } }; diff --git a/packages/kokkos-kernels/ode/src/KokkosODE_Types.hpp b/packages/kokkos-kernels/ode/src/KokkosODE_Types.hpp index 7d7822752680..5fb2c44846c1 
100644 --- a/packages/kokkos-kernels/ode/src/KokkosODE_Types.hpp +++ b/packages/kokkos-kernels/ode/src/KokkosODE_Types.hpp @@ -54,16 +54,19 @@ struct ODE_params { enum newton_solver_status : int { NLS_SUCCESS = 0, MAX_ITER = 1, - LIN_SOLVE_FAIL = 2 + LIN_SOLVE_FAIL = 2, + NLS_DIVERGENCE = 3, }; struct Newton_params { - int max_iters; + int max_iters, iters = 0; double abs_tol, rel_tol; - // Constructor that only specify the desired number of steps. - // In this case no adaptivity is provided, the time step will - // be constant such that dt = (tend - tstart) / num_steps; + // Constructor that sets basic solver parameters + // used while solving the nonlinear system + // int max_iters_ [in]: maximum number of iterations allowed + // double abs_tol_ [in]: absolute tolerance to reach for successful solve + // double rel_tol_ [in]: relative tolerance to reach for successful solve KOKKOS_FUNCTION Newton_params(const int max_iters_, const double abs_tol_, const double rel_tol_) diff --git a/packages/kokkos-kernels/ode/unit_test/Test_ODE.hpp b/packages/kokkos-kernels/ode/unit_test/Test_ODE.hpp index 5d4861879b7a..1b55171581ee 100644 --- a/packages/kokkos-kernels/ode/unit_test/Test_ODE.hpp +++ b/packages/kokkos-kernels/ode/unit_test/Test_ODE.hpp @@ -22,5 +22,6 @@ // Implicit integrators #include "Test_ODE_Newton.hpp" +#include "Test_ODE_BDF.hpp" #endif // TEST_ODE_HPP diff --git a/packages/kokkos-kernels/ode/unit_test/Test_ODE_BDF.hpp b/packages/kokkos-kernels/ode/unit_test/Test_ODE_BDF.hpp new file mode 100644 index 000000000000..836030297105 --- /dev/null +++ b/packages/kokkos-kernels/ode/unit_test/Test_ODE_BDF.hpp @@ -0,0 +1,830 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosODE_BDF.hpp" + +namespace Test { + +// Logistic equation +// Used to model population growth +// it is a simple nonlinear ODE with +// a lot of literature. +// +// Equation: y'(t) = r*y(t)*(1-y(t)/K) +// Jacobian: df/dy = r - 2*r*y/K +// Solution: y = K / (1 + ((K - y0) / y0)*exp(-rt)) +struct Logistic { + static constexpr int neqs = 1; + + const double r, K; + + Logistic(double r_, double K_) : r(r_), K(K_){}; + + template + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = r * y(0) * (1.0 - y(0) / K); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, + const double /*dt*/, const vec_type& y, + const mat_type& jac) const { + jac(0, 0) = r - 2 * r * y(0) / K; + } + + template + KOKKOS_FUNCTION void solution(const double t, const vec_type& y0, + const vec_type& y) const { + y(0) = K / (1 + (K - y0) / y0 * Kokkos::exp(-r * t)); + } + +}; // Logistic + +// Lotka-Volterra equation +// A predator-prey model that describe +// population dynamics when two species +// interact. 
+// +// Equations: y0'(t) = alpha*y0(t) - beta*y0(t)*y1(t) +// y1'(t) = delta*y0(t)*y1(t) - gamma*y1(t) +// Jacobian: df0/dy = [alpha-beta*y1(t); beta*y0(t)] +// df1/dy = [delta*y1(t); delta*y0(t)-gamma] +// Solution: y = K / (1 + ((K - y0) / y0)*exp(-rt)) +struct LotkaVolterra { + static constexpr int neqs = 2; + + const double alpha, beta, delta, gamma; + + LotkaVolterra(double alpha_, double beta_, double delta_, double gamma_) + : alpha(alpha_), beta(beta_), delta(delta_), gamma(gamma_){}; + + template + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = alpha * y(0) - beta * y(0) * y(1); + f(1) = delta * y(0) * y(1) - gamma * y(1); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, + const double /*dt*/, const vec_type& y, + const mat_type& jac) const { + jac(0, 0) = alpha - beta * y(1); + jac(0, 1) = -beta * y(0); + jac(1, 0) = delta * y(1); + jac(1, 1) = delta * y(0) - gamma; + } + +}; // LotkaVolterra + +// Robertson's autocatalytic chemical reaction: +// H. H. Robertson, The solution of a set of reaction rate equations, +// in J. Walsh (Ed.), Numerical Analysis: An Introduction, +// pp. 178–182, Academic Press, London (1966). 
+// +// Equations: y0' = -0.04*y0 + 10e4*y1*y2 +// y1' = 0.04*y0 - 10e4*y1*y2 - 3e7 * y1**2 +// y2' = 3e7 * y1**2 +struct StiffChemistry { + static constexpr int neqs = 3; + + StiffChemistry() {} + + template + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = -0.04 * y(0) + 1.e4 * y(1) * y(2); + f(1) = 0.04 * y(0) - 1.e4 * y(1) * y(2) - 3.e7 * y(1) * y(1); + f(2) = 3.e7 * y(1) * y(1); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, + const double /*dt*/, const vec_type& y, + const mat_type& jac) const { + jac(0, 0) = -0.04; + jac(0, 1) = 1.e4 * y(2); + jac(0, 2) = 1.e4 * y(1); + jac(1, 0) = 0.04; + jac(1, 1) = -1.e4 * y(2) - 3.e7 * 2.0 * y(1); + jac(1, 2) = -1.e4 * y(1); + jac(2, 0) = 0.0; + jac(2, 1) = 3.e7 * 2.0 * y(1); + jac(2, 2) = 0.0; + } +}; + +template +struct BDFSolve_wrapper { + ode_type my_ode; + scalar_type tstart, tend; + int num_steps; + vec_type y_old, y_new, rhs, update, scale; + mv_type y_vecs, kstack; + mat_type temp, jac; + + BDFSolve_wrapper(const ode_type& my_ode_, const scalar_type tstart_, + const scalar_type tend_, const int num_steps_, + const vec_type& y_old_, const vec_type& y_new_, + const vec_type& rhs_, const vec_type& update_, + const vec_type& scale_, const mv_type& y_vecs_, + const mv_type& kstack_, const mat_type& temp_, + const mat_type& jac_) + : my_ode(my_ode_), + tstart(tstart_), + tend(tend_), + num_steps(num_steps_), + y_old(y_old_), + y_new(y_new_), + rhs(rhs_), + update(update_), + scale(scale_), + y_vecs(y_vecs_), + kstack(kstack_), + temp(temp_), + jac(jac_) {} + + KOKKOS_FUNCTION + void operator()(const int /*idx*/) const { + KokkosODE::Experimental::BDF::Solve( + my_ode, tstart, tend, num_steps, y_old, y_new, rhs, update, scale, + y_vecs, kstack, temp, jac); + } +}; + +template +struct BDF_Solve_wrapper { + const ode_type my_ode; + const scalar_type t_start, t_end, dt, max_step; + const vec_type 
y0, y_new; + const mat_type temp, temp2; + + BDF_Solve_wrapper(const ode_type& my_ode_, const scalar_type& t_start_, + const scalar_type& t_end_, const scalar_type& dt_, + const scalar_type& max_step_, const vec_type& y0_, + const vec_type& y_new_, const mat_type& temp_, + const mat_type& temp2_) + : my_ode(my_ode_), + t_start(t_start_), + t_end(t_end_), + dt(dt_), + max_step(max_step_), + y0(y0_), + y_new(y_new_), + temp(temp_), + temp2(temp2_) {} + + KOKKOS_FUNCTION void operator()(const int) const { + KokkosODE::Experimental::BDFSolve(my_ode, t_start, t_end, dt, max_step, y0, + y_new, temp, temp2); + } +}; + +template +void test_BDF_Logistic() { + using execution_space = typename device_type::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using mat_type = Kokkos::View; + + Kokkos::RangePolicy myPolicy(0, 1); + Logistic mySys(1, 1); + + constexpr int num_tests = 7; + int num_steps[num_tests] = {512, 256, 128, 64, 32, 16, 8}; + double errors[num_tests] = {0}; + const scalar_type t_start = 0.0, t_end = 6.0; + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); + vec_type scale("scaling factors", mySys.neqs); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), + temp("temp storage", mySys.neqs, mySys.neqs + 4); + mv_type kstack("Startup RK vectors", 6, mySys.neqs); + + Kokkos::deep_copy(scale, 1); + + scalar_type measured_order; + + // Test BDF1 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF1 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 1); + + Kokkos::deep_copy(y0, 0.5); + Kokkos::deep_copy(y_vecs, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + auto y_new_h = 
Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + EXPECT_NEAR_KK_REL(measured_order, 2.0, 0.15); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 2, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 2.0) / 2.0 + << std::endl; +#endif + + // Test BDF2 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF2 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 2); + Kokkos::deep_copy(y0, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + EXPECT_NEAR_KK_REL(measured_order, 4.0, 0.15); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 4, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 4.0) / 4.0 + << std::endl; +#endif + + // Test BDF3 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF3 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 3); + Kokkos::deep_copy(y0, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + 
auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); + EXPECT_NEAR_KK_REL(measured_order, 8.0, 0.15); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 8, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 8.0) / 8.0 + << std::endl; +#endif + + // Test BDF4 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF4 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 4); + Kokkos::deep_copy(y0, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 16, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 16.0) / 16.0 + << std::endl; +#endif + + // Test BDF5 +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "\nBDF5 convergence test" << std::endl; +#endif + for (int idx = 0; idx < num_tests; idx++) { + mv_type y_vecs("history vectors", mySys.neqs, 5); + Kokkos::deep_copy(y0, 0.5); + + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, num_steps[idx], y0, y_new, rhs, + update, scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); + Kokkos::fence(); + + auto y_new_h = 
Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + + errors[idx] = Kokkos::abs(y_new_h(0) - 1 / (1 + Kokkos::exp(-t_end))) / + Kokkos::abs(1 / (1 + Kokkos::exp(-t_end))); + } + measured_order = + Kokkos::pow(errors[num_tests - 1] / errors[0], 1.0 / (num_tests - 1)); +#if defined(HAVE_KOKKOSKERNELS_DEBUG) + std::cout << "expected ratio: 32, actual ratio: " << measured_order + << ", order error=" << Kokkos::abs(measured_order - 32.0) / 32.0 + << std::endl; +#endif + +} // test_BDF_Logistic + +template +void test_BDF_LotkaVolterra() { + using execution_space = typename device_type::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using mat_type = Kokkos::View; + + LotkaVolterra mySys(1.1, 0.4, 0.1, 0.4); + + const scalar_type t_start = 0.0, t_end = 100.0; + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); + vec_type scale("scaling factors", mySys.neqs); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), + temp("temp storage", mySys.neqs, mySys.neqs + 4); + + Kokkos::deep_copy(scale, 1); + + // Test BDF5 + mv_type kstack("Startup RK vectors", 6, mySys.neqs); + mv_type y_vecs("history vectors", mySys.neqs, 5); + + Kokkos::deep_copy(y0, 10.0); + Kokkos::deep_copy(y_vecs, 10.0); + + Kokkos::RangePolicy myPolicy(0, 1); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, 1000, y0, y_new, rhs, update, scale, + y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); +} + +template +void test_BDF_StiffChemistry() { + using execution_space = typename device_type::execution_space; + using vec_type = Kokkos::View; + using mv_type = Kokkos::View; + using mat_type = Kokkos::View; + + StiffChemistry mySys{}; + + const scalar_type t_start = 0.0, t_end = 500.0; + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); + 
vec_type scale("scaling factors", mySys.neqs); + mat_type jac("jacobian", mySys.neqs, mySys.neqs), + temp("temp storage", mySys.neqs, mySys.neqs + 4); + + Kokkos::deep_copy(scale, 1); + + // Test BDF5 + mv_type kstack("Startup RK vectors", 6, mySys.neqs); + mv_type y_vecs("history vectors", mySys.neqs, 5); + + auto y0_h = Kokkos::create_mirror_view(y0); + y0_h(0) = 1.0; + y0_h(1) = 0.0; + y0_h(2) = 0.0; + Kokkos::deep_copy(y0, y0_h); + Kokkos::deep_copy(y_vecs, 0.0); + + Kokkos::RangePolicy myPolicy(0, 1); + BDFSolve_wrapper + solve_wrapper(mySys, t_start, t_end, 110000, y0, y_new, rhs, update, + scale, y_vecs, kstack, temp, jac); + Kokkos::parallel_for(myPolicy, solve_wrapper); +} + +// template +// struct BDFSolve_parallel { +// ode_type my_ode; +// scalar_type tstart, tend; +// int num_steps; +// vec_type y_old, y_new, rhs, update, scale; +// mv_type y_vecs, kstack; +// mat_type temp, jac; + +// BDFSolve_parallel(const ode_type& my_ode_, const scalar_type tstart_, +// const scalar_type tend_, const int num_steps_, +// const vec_type& y_old_, const vec_type& y_new_, +// const vec_type& rhs_, const vec_type& update_, +// const vec_type& scale_, +// const mv_type& y_vecs_, const mv_type& kstack_, +// const mat_type& temp_, const mat_type& jac_) +// : my_ode(my_ode_), +// tstart(tstart_), +// tend(tend_), +// num_steps(num_steps_), +// y_old(y_old_), +// y_new(y_new_), +// rhs(rhs_), +// update(update_), +// scale(scale_), +// y_vecs(y_vecs_), +// kstack(kstack_), +// temp(temp_), +// jac(jac_) {} + +// KOKKOS_FUNCTION +// void operator()(const int idx) const { +// auto local_y_old = Kokkos::subview( +// y_old, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); +// auto local_y_new = Kokkos::subview( +// y_new, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); +// auto local_rhs = Kokkos::subview( +// rhs, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); +// auto local_update = Kokkos::subview( +// update, +// 
Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); + +// auto local_y_vecs = Kokkos::subview( +// y_vecs, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1)), +// Kokkos::ALL()); +// auto local_kstack = Kokkos::subview( +// kstack, Kokkos::ALL(), +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1))); +// auto local_temp = Kokkos::subview( +// temp, +// Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + 1)), +// Kokkos::ALL()); +// auto local_jac = Kokkos::subview( +// jac, Kokkos::pair(my_ode.neqs * idx, my_ode.neqs * (idx + +// 1)), Kokkos::ALL()); + +// KokkosODE::Experimental::BDF::Solve( +// my_ode, tstart, tend, num_steps, local_y_old, local_y_new, local_rhs, +// local_update, scale, local_y_vecs, local_kstack, local_temp, +// local_jac); +// } +// }; + +// template +// void test_BDF_parallel() { +// using execution_space = typename device_type::execution_space; +// using vec_type = Kokkos::View; +// using mv_type = Kokkos::View; +// using mat_type = Kokkos::View; + +// LotkaVolterra mySys(1.1, 0.4, 0.1, 0.4); +// constexpr int num_solves = 1000; + +// vec_type scale("scaling factors", mySys.neqs); +// Kokkos::deep_copy(scale, 1); + +// const scalar_type t_start = 0.0, t_end = 100.0; +// vec_type y0("initial conditions", mySys.neqs * num_solves), +// y_new("solution", mySys.neqs * num_solves); +// vec_type rhs("rhs", mySys.neqs * num_solves), +// update("update", mySys.neqs * num_solves); +// mat_type jac("jacobian", mySys.neqs * num_solves, mySys.neqs), +// temp("temp storage", mySys.neqs * num_solves, mySys.neqs + 4); + +// // Test BDF5 +// mv_type y_vecs("history vectors", mySys.neqs * num_solves, 5), +// kstack("Startup RK vectors", 6, mySys.neqs * num_solves); + +// Kokkos::deep_copy(y0, 10.0); +// Kokkos::deep_copy(y_vecs, 10.0); + +// Kokkos::RangePolicy myPolicy(0, num_solves); +// BDFSolve_parallel +// solve_wrapper(mySys, t_start, t_end, 1000, y0, y_new, rhs, update, +// scale, y_vecs, +// kstack, temp, jac); +// 
Kokkos::parallel_for(myPolicy, solve_wrapper); + +// Kokkos::fence(); +// } + +template +void compute_coeffs(const int order, const scalar_type factor, + const mat_type& coeffs) { + std::cout << "compute_coeffs" << std::endl; + + coeffs(0, 0) = 1.0; + for (int colIdx = 0; colIdx < order; ++colIdx) { + coeffs(0, colIdx + 1) = 1.0; + for (int rowIdx = 0; rowIdx < order; ++rowIdx) { + coeffs(rowIdx + 1, colIdx + 1) = + ((rowIdx - factor * (colIdx + 1.0)) / (rowIdx + 1.0)) * + coeffs(rowIdx, colIdx + 1); + } + } +} + +template +void update_D(const int order, const scalar_type factor, const mat_type& coeffs, + const mat_type& tempD, const mat_type& D) { + auto subD = + Kokkos::subview(D, Kokkos::pair(0, order + 1), Kokkos::ALL); + auto subTempD = + Kokkos::subview(tempD, Kokkos::pair(0, order + 1), Kokkos::ALL); + + compute_coeffs(order, factor, coeffs); + auto R = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), + Kokkos::pair(0, order + 1)); + std::cout << "SerialGemm" << std::endl; + KokkosBatched::SerialGemm< + KokkosBatched::Trans::Transpose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, R, subD, 0.0, subTempD); + + compute_coeffs(order, 1.0, coeffs); + auto U = Kokkos::subview(coeffs, Kokkos::pair(0, order + 1), + Kokkos::pair(0, order + 1)); + std::cout << "SerialGemm" << std::endl; + KokkosBatched::SerialGemm< + KokkosBatched::Trans::Transpose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Blocked>::invoke(1.0, U, subTempD, 0.0, subD); +} + +template +void test_Nordsieck() { + using execution_space = Kokkos::HostSpace; + StiffChemistry mySys{}; + + Kokkos::View R("coeffs", 6, 6), + U("coeffs", 6, 6); + Kokkos::View D("D", 8, mySys.neqs), + tempD("tmp", 8, mySys.neqs); + int order = 1; + double factor = 0.8; + + constexpr double t_start = 0.0, t_end = 500.0; + int max_steps = 200000; + double dt = (t_end - t_start) / max_steps; + + auto y0 = Kokkos::subview(D, 0, Kokkos::ALL()); + auto f = 
Kokkos::subview(D, 1, Kokkos::ALL()); + y0(0) = 1.0; + + mySys.evaluate_function(0, 0, y0, f); + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + f(eqIdx) *= dt; + } + + compute_coeffs(order, factor, R); + compute_coeffs(order, 1.0, U); + + { + std::cout << "R: " << std::endl; + for (int i = 0; i < order + 1; ++i) { + std::cout << "{ "; + for (int j = 0; j < order + 1; ++j) { + std::cout << R(i, j) << ", "; + } + std::cout << "}" << std::endl; + } + } + + std::cout << "D before update:" << std::endl; + std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" + << std::endl; + std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" + << std::endl; + update_D(order, factor, R, tempD, D); + + std::cout << "D after update:" << std::endl; + std::cout << " { " << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) << " }" + << std::endl; + std::cout << " { " << D(1, 0) << ", " << D(1, 1) << ", " << D(1, 2) << " }" + << std::endl; +} + +template +void test_adaptive_BDF() { + using execution_space = typename device_type::execution_space; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + Logistic mySys(1, 1); + + constexpr double t_start = 0.0, t_end = 6.0, atol = 1.0e-6, rtol = 1.0e-4; + constexpr int num_steps = 512, max_newton_iters = 5; + int order = 1, num_equal_steps = 0; + double dt = (t_end - t_start) / num_steps; + double t = t_start; + + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + vec_type rhs("rhs", mySys.neqs), update("update", mySys.neqs); + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), + temp2("buffer2", 6, 7); + + // Initial condition + Kokkos::deep_copy(y0, 0.5); + + // Initialize D + auto D = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::pair(2, 10)); + D(0, 0) = y0(0); + mySys.evaluate_function(0, 0, y0, rhs); + D(0, 1) = dt * rhs(0); + Kokkos::deep_copy(rhs, 0); + + std::cout << "**********************\n" + << " Step 1\n" + << 
"**********************" << std::endl; + + std::cout << "Initial conditions" << std::endl; + std::cout << " y0=" << y0(0) << ", t=" << t << ", dt=" << dt << std::endl; + + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) + << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " + << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, + max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, + update, temp, temp2); + + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + y0(eqIdx) = y_new(eqIdx); + } + + std::cout << "**********************\n" + << " Step 2\n" + << "**********************" << std::endl; + + std::cout << " y0=" << y0(0) << ", t=" << t << ", dt: " << dt << std::endl; + + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) + << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " + << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, + max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, + update, temp, temp2); + + for (int eqIdx = 0; eqIdx < mySys.neqs; ++eqIdx) { + y0(eqIdx) = y_new(eqIdx); + } + + std::cout << "**********************\n" + << " Step 3\n" + << "**********************" << std::endl; + + std::cout << " y0=" << y0(0) << ", t=" << t << ", dt: " << dt << std::endl; + + std::cout << "Initial D: {" << D(0, 0) << ", " << D(0, 1) << ", " << D(0, 2) + << ", " << D(0, 3) << ", " << D(0, 4) << ", " << D(0, 5) << ", " + << D(0, 6) << ", " << D(0, 7) << "}" << std::endl; + + KokkosODE::Impl::BDFStep(mySys, t, dt, t_end, order, num_equal_steps, + max_newton_iters, atol, rtol, 0.2, y0, y_new, rhs, + update, temp, temp2); + + std::cout << "Final t: " << t << ", y=" << y_new(0) << std::endl; + +} // test_adaptive_BDF() + +template +void test_adaptive_BDF_v2() { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + using KAT = 
Kokkos::ArithTraits; + + std::cout << "\n\n\nBDF_v2 test starting\n" << std::endl; + + Logistic mySys(1, 1); + + const scalar_type t_start = KAT::zero(), + t_end = 6 * KAT::one(); //, atol = 1.0e-6, rtol = 1.0e-4; + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + Kokkos::deep_copy(y0, 0.5); + + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), + temp2("buffer2", 6, 7); + + { + scalar_type dt = KAT::zero(); + vec_type f0("initial value f", mySys.neqs); + mySys.evaluate_function(t_start, KAT::zero(), y0, f0); + KokkosODE::Impl::initial_step_size(mySys, 1, t_start, 1e-6, 1e-3, y0, f0, + temp, dt); + + std::cout << "Initial Step Size: dt=" << dt << std::endl; + } + + KokkosODE::Experimental::BDFSolve(mySys, t_start, t_end, 0.0117188, + (t_end - t_start) / 10, y0, y_new, temp, + temp2); +} + +template +void test_BDF_adaptive_stiff() { + using execution_space = typename Device::execution_space; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + using KAT = Kokkos::ArithTraits; + + StiffChemistry mySys{}; + + const scalar_type t_start = KAT::zero(), t_end = 350 * KAT::one(); + scalar_type dt = KAT::zero(); + vec_type y0("initial conditions", mySys.neqs), y_new("solution", mySys.neqs); + + // Set initial conditions + auto y0_h = Kokkos::create_mirror_view(y0); + y0_h(0) = KAT::one(); + y0_h(1) = KAT::zero(); + y0_h(2) = KAT::zero(); + Kokkos::deep_copy(y0, y0_h); + + mat_type temp("buffer1", mySys.neqs, 23 + 2 * mySys.neqs + 4), + temp2("buffer2", 6, 7); + + Kokkos::RangePolicy policy(0, 1); + BDF_Solve_wrapper bdf_wrapper(mySys, t_start, t_end, dt, + (t_end - t_start) / 10, y0, y_new, temp, temp2); + + Kokkos::parallel_for(policy, bdf_wrapper); + + auto y_new_h = Kokkos::create_mirror_view(y_new); + Kokkos::deep_copy(y_new_h, y_new); + std::cout << "Stiff Chemistry solution at t=500: {" << y_new_h(0) << ", " + << y_new_h(1) << ", " << y_new_h(2) << "}" << std::endl; +} + +} // namespace Test + 
+TEST_F(TestCategory, BDF_Logistic_serial) { + ::Test::test_BDF_Logistic(); +} +TEST_F(TestCategory, BDF_LotkaVolterra_serial) { + ::Test::test_BDF_LotkaVolterra(); +} +TEST_F(TestCategory, BDF_StiffChemistry_serial) { + ::Test::test_BDF_StiffChemistry(); +} +// TEST_F(TestCategory, BDF_parallel_serial) { +// ::Test::test_BDF_parallel(); +// } +TEST_F(TestCategory, BDF_Nordsieck) { + ::Test::test_Nordsieck(); +} +// TEST_F(TestCategory, BDF_adaptive) { +// ::Test::test_adaptive_BDF(); +// ::Test::test_adaptive_BDF_v2(); +// } +TEST_F(TestCategory, BDF_StiffChemistry_adaptive) { + ::Test::test_BDF_adaptive_stiff(); +} diff --git a/packages/kokkos-kernels/ode/unit_test/Test_ODE_Newton.hpp b/packages/kokkos-kernels/ode/unit_test/Test_ODE_Newton.hpp index d235df1e5689..45dd4adb6adf 100644 --- a/packages/kokkos-kernels/ode/unit_test/Test_ODE_Newton.hpp +++ b/packages/kokkos-kernels/ode/unit_test/Test_ODE_Newton.hpp @@ -21,7 +21,8 @@ namespace Test { -template +template struct NewtonSolve_wrapper { using newton_params = KokkosODE::Experimental::Newton_params; @@ -32,10 +33,13 @@ struct NewtonSolve_wrapper { mat_type J, tmp; status_view status; + scale_type scale; + NewtonSolve_wrapper(const system_type& my_nls_, const newton_params& params_, const vec_type& x_, const vec_type& rhs_, const vec_type& update_, const mat_type& J_, - const mat_type& tmp_, const status_view& status_) + const mat_type& tmp_, const status_view& status_, + const scale_type& scale_) : my_nls(my_nls_), params(params_), x(x_), @@ -43,7 +47,8 @@ struct NewtonSolve_wrapper { update(update_), J(J_), tmp(tmp_), - status(status_) {} + status(status_), + scale(scale_) {} KOKKOS_FUNCTION void operator()(const int idx) const { @@ -71,7 +76,8 @@ struct NewtonSolve_wrapper { // Run Newton nonlinear solver status(idx) = KokkosODE::Experimental::Newton::Solve( - my_nls, params, local_J, local_tmp, local_x, local_rhs, local_update); + my_nls, params, local_J, local_tmp, local_x, local_rhs, local_update, + 
scale); } }; @@ -87,6 +93,9 @@ void run_newton_test(const system_type& mySys, Kokkos::View status("Newton status", 1); + vec_type scale("scaling factors", mySys.neqs); + Kokkos::deep_copy(scale, 1); + vec_type x("solution vector", mySys.neqs), rhs("right hand side vector", mySys.neqs); auto x_h = Kokkos::create_mirror_view(x); @@ -104,7 +113,7 @@ void run_newton_test(const system_type& mySys, Kokkos::RangePolicy my_policy(0, 1); NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, - status); + status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); @@ -205,6 +214,9 @@ void test_newton_status() { using vec_type = typename Kokkos::View; using mat_type = typename Kokkos::View; + vec_type scale("scaling factors", 1); + Kokkos::deep_copy(scale, 1); + double abs_tol, rel_tol; if (std::is_same_v) { rel_tol = 10e-5; @@ -227,7 +239,7 @@ void test_newton_status() { scalar_type solution[3] = {2.0, -1.0, 0.0}; #endif newton_solver_status newton_status[3] = { - newton_solver_status::NLS_SUCCESS, newton_solver_status::MAX_ITER, + newton_solver_status::NLS_SUCCESS, newton_solver_status::NLS_DIVERGENCE, newton_solver_status::LIN_SOLVE_FAIL}; vec_type x("solution vector", 1), rhs("right hand side vector", 1); auto x_h = Kokkos::create_mirror_view(x); @@ -242,7 +254,7 @@ void test_newton_status() { Kokkos::RangePolicy my_policy(0, 1); NewtonSolve_wrapper solve_wrapper(my_system, params, x, rhs, update, J, tmp, - status); + status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::deep_copy(status_h, status); @@ -481,6 +493,9 @@ void test_newton_on_device() { system_type mySys{}; + vec_type scale("scaling factors", mySys.neqs); + Kokkos::deep_copy(scale, 1); + vec_type x("solution vector", mySys.neqs * num_systems); vec_type rhs("right hand side vector", mySys.neqs * num_systems); vec_type update("update", mySys.neqs * num_systems); @@ -503,7 +518,7 @@ void test_newton_on_device() { Kokkos::RangePolicy my_policy(0, num_systems); 
NewtonSolve_wrapper solve_wrapper(mySys, params, x, rhs, update, J, tmp, - status); + status, scale); Kokkos::parallel_for(my_policy, solve_wrapper); Kokkos::fence(); diff --git a/packages/kokkos-kernels/perf_test/CMakeLists.txt b/packages/kokkos-kernels/perf_test/CMakeLists.txt index cf1905d6d4d1..28271dfb0d36 100644 --- a/packages/kokkos-kernels/perf_test/CMakeLists.txt +++ b/packages/kokkos-kernels/perf_test/CMakeLists.txt @@ -49,6 +49,7 @@ if (KokkosKernels_ENABLE_PERFTESTS) ADD_COMPONENT_SUBDIRECTORY(sparse) ADD_COMPONENT_SUBDIRECTORY(blas) ADD_COMPONENT_SUBDIRECTORY(ode) + ADD_COMPONENT_SUBDIRECTORY(lapack) ADD_SUBDIRECTORY(performance) #ADD_SUBDIRECTORY(common) diff --git a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp index d617ffcdf3d2..cd7f194071d6 100644 --- a/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test_benchmark.cpp @@ -142,7 +142,19 @@ void run(const blas3_gemm_params& params) { int main(int argc, char** argv) { const auto params = blas3_gemm_params::get_params(argc, argv); const int num_threads = params.use_openmp; - const int device_id = params.use_cuda - 1; + + // the common parameter parser takes the requested device ID and + // adds 1 to it (e.g. --cuda 0 -> params.use_cuda = 1) + // this is presumably so that 0 can be a sentinel value, + // even though device ID 0 is valid + // here, we use CUDA, SYCL, or HIP, whichever is set first, to + // choose which device Kokkos should initialize on + // or -1, for no such selection + const int device_id = + params.use_cuda + ? params.use_cuda - 1 + : (params.use_sycl ? params.use_sycl - 1 + : (params.use_hip ? 
params.use_hip - 1 : -1)); Kokkos::initialize(Kokkos::InitializationSettings() .set_num_threads(num_threads) diff --git a/packages/kokkos-kernels/perf_test/lapack/CMakeLists.txt b/packages/kokkos-kernels/perf_test/lapack/CMakeLists.txt new file mode 100644 index 000000000000..478703d38ad9 --- /dev/null +++ b/packages/kokkos-kernels/perf_test/lapack/CMakeLists.txt @@ -0,0 +1,8 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +if(KOKKOSKERNELS_ENABLE_BENCHMARK) + KOKKOSKERNELS_ADD_BENCHMARK( + lapack_svd SOURCES KokkosLapack_SVD_benchmark.cpp + ) +endif() diff --git a/packages/kokkos-kernels/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp b/packages/kokkos-kernels/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp new file mode 100644 index 000000000000..1ac9381ff89e --- /dev/null +++ b/packages/kokkos-kernels/perf_test/lapack/KokkosLapack_SVD_benchmark.cpp @@ -0,0 +1,124 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosLapack_svd.hpp" + +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include +#include "Benchmark_Context.hpp" + +struct svd_parameters { + int numRows, numCols; + bool verbose; + + svd_parameters(const int numRows_, const int numCols_, const bool verbose_) + : numRows(numRows_), numCols(numCols_), verbose(verbose_){}; +}; + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\t[Optional] --m :: number of rows of A" << std::endl; + std::cerr << "\t[Optional] --n :: number of columns of A" + << std::endl; +} // print_options + +int parse_inputs(svd_parameters& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--m", params.numRows)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--n", params.numCols)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +template +void run_svd_benchmark(benchmark::State& state, + const svd_parameters& svd_params) { + using mat_type = Kokkos::View; + using vec_type = Kokkos::View; + + const int m = svd_params.numRows; + const int n = svd_params.numCols; + + mat_type A("A", m, n), U("U", m, m), Vt("Vt", n, n); + vec_type S("S", Kokkos::min(m, n)); + + const uint64_t seed = + std::chrono::high_resolution_clock::now().time_since_epoch().count(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + + // Initialize A with random numbers + double randStart = 0, randEnd = 0; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(A, 
rand_pool, randStart, randEnd); + + for (auto _ : state) { + (void)_; + KokkosLapack::svd("A", "A", A, S, U, Vt); + Kokkos::fence(); + } +} + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kMillisecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + svd_parameters svd_params(0, 0, false); + parse_inputs(svd_params, argc, argv); + + std::string bench_name = "KokkosLapack_SVD"; + + if (0 < common_params.repeat) { + benchmark::RegisterBenchmark( + bench_name.c_str(), run_svd_benchmark, + svd_params) + ->UseRealTime() + ->Iterations(common_params.repeat); + } else { + benchmark::RegisterBenchmark( + bench_name.c_str(), run_svd_benchmark, + svd_params) + ->UseRealTime(); + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + Kokkos::finalize(); + + return 0; +} diff --git a/packages/kokkos-kernels/perf_test/ode/CMakeLists.txt b/packages/kokkos-kernels/perf_test/ode/CMakeLists.txt index b4aa86889fb9..39acabed9807 100644 --- a/packages/kokkos-kernels/perf_test/ode/CMakeLists.txt +++ b/packages/kokkos-kernels/perf_test/ode/CMakeLists.txt @@ -5,4 +5,8 @@ if(KOKKOSKERNELS_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( ode_runge_kutta SOURCES KokkosODE_RK.cpp ) + + KOKKOSKERNELS_ADD_BENCHMARK( + ode_bdf_solver SOURCES KokkosODE_BDF.cpp + ) endif() diff --git a/packages/kokkos-kernels/perf_test/ode/KokkosODE_BDF.cpp b/packages/kokkos-kernels/perf_test/ode/KokkosODE_BDF.cpp new file mode 100644 index 000000000000..84a310666f30 --- /dev/null +++ b/packages/kokkos-kernels/perf_test/ode/KokkosODE_BDF.cpp @@ -0,0 +1,266 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include "KokkosODE_BDF.hpp" + +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#include +#include "Benchmark_Context.hpp" + +namespace { +// Robertson's autocatalytic chemical reaction: +// H. H. Robertson, The solution of a set of reaction rate equations, +// in J. Walsh (Ed.), Numerical Analysis: An Introduction, +// pp. 178–182, Academic Press, London (1966). +// +// Equations: y0' = -0.04*y0 + 10e4*y1*y2 +// y1' = 0.04*y0 - 10e4*y1*y2 - 3e7 * y1**2 +// y2' = 3e7 * y1**2 +struct StiffChemistry { + static constexpr int neqs = 3; + + StiffChemistry() {} + + template + KOKKOS_FUNCTION void evaluate_function(const double /*t*/, + const double /*dt*/, + const vec_type1& y, + const vec_type2& f) const { + f(0) = -0.04 * y(0) + 1.e4 * y(1) * y(2); + f(1) = 0.04 * y(0) - 1.e4 * y(1) * y(2) - 3.e7 * y(1) * y(1); + f(2) = 3.e7 * y(1) * y(1); + } + + template + KOKKOS_FUNCTION void evaluate_jacobian(const double /*t*/, + const double /*dt*/, const vec_type& y, + const mat_type& jac) const { + jac(0, 0) = -0.04; + jac(0, 1) = 1.e4 * y(2); + jac(0, 2) = 1.e4 * y(1); + jac(1, 0) = 0.04; + jac(1, 1) = -1.e4 * y(2) - 3.e7 * 2.0 * y(1); + jac(1, 2) = -1.e4 * y(1); + jac(2, 0) = 0.0; + jac(2, 1) = 3.e7 * 2.0 * y(1); + jac(2, 2) = 0.0; + } +}; + +template +struct BDF_Solve_wrapper { + const ode_type my_ode; + const scalar_type t_start, t_end, dt, max_step; + const vec_type y0, y_new; + const mat_type temp, temp2; + + BDF_Solve_wrapper(const ode_type& my_ode_, const scalar_type& t_start_, + const scalar_type& t_end_, const scalar_type& dt_, + const scalar_type& max_step_, const vec_type& y0_, + const 
vec_type& y_new_, const mat_type& temp_, + const mat_type& temp2_) + : my_ode(my_ode_), + t_start(t_start_), + t_end(t_end_), + dt(dt_), + max_step(max_step_), + y0(y0_), + y_new(y_new_), + temp(temp_), + temp2(temp2_) {} + + KOKKOS_FUNCTION void operator()(const int idx) const { + auto subTemp = Kokkos::subview(temp, Kokkos::ALL(), Kokkos::ALL(), idx); + auto subTemp2 = Kokkos::subview(temp2, Kokkos::ALL(), Kokkos::ALL(), idx); + auto subY0 = Kokkos::subview(y0, Kokkos::ALL(), idx); + auto subYnew = Kokkos::subview(y_new, Kokkos::ALL(), idx); + + KokkosODE::Experimental::BDFSolve(my_ode, t_start, t_end, dt, max_step, + subY0, subYnew, subTemp, subTemp2); + } +}; + +} // namespace + +struct bdf_input_parameters { + int num_odes; + int repeat; + bool verbose; + + bdf_input_parameters(const int num_odes_, const int repeat_, + const bool verbose_) + : num_odes(num_odes_), repeat(repeat_), verbose(verbose_){}; +}; + +template +void run_ode_chem(benchmark::State& state, const bdf_input_parameters& inputs) { + using scalar_type = double; + using KAT = Kokkos::ArithTraits; + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + StiffChemistry mySys{}; + + const bool verbose = inputs.verbose; + const int num_odes = inputs.num_odes; + const int neqs = mySys.neqs; + + const scalar_type t_start = KAT::zero(), t_end = 350 * KAT::one(); + scalar_type dt = KAT::zero(); + vec_type y0("initial conditions", neqs, num_odes); + vec_type y_new("solution", neqs, num_odes); + + // Set initial conditions + auto y0_h = Kokkos::create_mirror(y0); + for (int sysIdx = 0; sysIdx < num_odes; ++sysIdx) { + y0_h(0, sysIdx) = KAT::one(); + y0_h(1, sysIdx) = KAT::zero(); + y0_h(2, sysIdx) = KAT::zero(); + } + + mat_type temp("buffer1", neqs, 23 + 2 * neqs + 4, num_odes), + temp2("buffer2", 6, 7, num_odes); + + if (verbose) { + std::cout << "Number of problems solved in parallel: " << num_odes + << std::endl; + } + + Kokkos::RangePolicy policy(0, num_odes); + + Kokkos::Timer time; + 
time.reset(); + for (auto _ : state) { + (void)_; + + // Set initial conditions for each test iteration + state.PauseTiming(); + dt = KAT::zero(); + Kokkos::deep_copy(y0, y0_h); + Kokkos::deep_copy(y_new, KAT::zero()); + Kokkos::deep_copy(temp, KAT::zero()); + Kokkos::deep_copy(temp2, KAT::zero()); + BDF_Solve_wrapper bdf_wrapper(mySys, t_start, t_end, dt, + (t_end - t_start) / 10, y0, y_new, temp, + temp2); + state.ResumeTiming(); + + // Actually run the time integrator + Kokkos::parallel_for(policy, bdf_wrapper); + Kokkos::fence(); + } + double run_time = time.seconds(); + std::cout << "Run time: " << run_time << std::endl; + + Kokkos::deep_copy(y0_h, y0); + double error; + for (int odeIdx = 0; odeIdx < num_odes; ++odeIdx) { + error = 0; + // error += Kokkos::abs(y0_h(0, odeIdx) - 0.4193639) / 0.4193639; + // error += Kokkos::abs(y0_h(1, odeIdx) - 0.000002843646) / 0.000002843646; + // error += Kokkos::abs(y0_h(2, odeIdx) - 0.5806333) / 0.5806333; + error += Kokkos::abs(y0_h(0, odeIdx) - 0.462966) / 0.462966; + error += Kokkos::abs(y0_h(1, odeIdx) - 3.42699e-06) / 3.42699e-06; + error += Kokkos::abs(y0_h(2, odeIdx) - 0.537030) / 0.537030; + error = error / 3; + + if (error > 1e-6) { + std::cout << "Large error in problem " << odeIdx << ": " << error + << std::endl; + } + } +} + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr << perf_test::list_common_options(); + + std::cerr + << "\t[Optional] --repeat :: how many times to repeat overall test" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\t[Optional] --n :: number of ode problems to solve" + << std::endl; +} // print_options + +int parse_inputs(bdf_input_parameters& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (perf_test::check_arg_int(i, argc, argv, "--n", params.num_odes)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--repeat", + params.repeat)) { + ++i; + } else if 
(perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +template +void run_benchmark_wrapper(benchmark::State& state, + bdf_input_parameters params) { + run_ode_chem(state, params); +} + +int main(int argc, char** argv) { + Kokkos::initialize(argc, argv); + { + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kMillisecond); + KokkosKernelsBenchmark::add_benchmark_context(true); + + perf_test::CommonInputParams common_params; + perf_test::parse_common_options(argc, argv, common_params); + + std::string bench_name = "KokkosODE_BDF_Stiff_Chem"; + bdf_input_parameters params(1000, 1, false); + parse_inputs(params, argc, argv); + + if (0 < common_params.repeat) { + benchmark::RegisterBenchmark( + bench_name.c_str(), + run_benchmark_wrapper, params) + ->UseRealTime() + ->ArgNames({"n"}) + ->Args({params.num_odes}) + ->Iterations(common_params.repeat); + } else { + benchmark::RegisterBenchmark( + bench_name.c_str(), + run_benchmark_wrapper, params) + ->UseRealTime() + ->ArgNames({"n"}) + ->Args({params.num_odes}); + } + + benchmark::RunSpecifiedBenchmarks(); + + benchmark::Shutdown(); + } + Kokkos::finalize(); + + return 0; +} diff --git a/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt b/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt index 8a994b4122aa..ef0bf7d99530 100644 --- a/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt +++ b/packages/kokkos-kernels/perf_test/sparse/CMakeLists.txt @@ -145,4 +145,9 @@ if (KokkosKernels_ENABLE_BENCHMARK) if (Kokkos_CXX_COMPILER_ID STREQUAL HIPCC AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS 5.3) target_link_libraries(KokkosKernels_sparse_spmv_bsr_benchmark PRIVATE -lstdc++fs) endif() + # IntelLLVM < 2023.1.0 (and possible higher versions too) have an underlying clang that has the 
std::filesystem + # in an experimental namespace and a different library + if (Kokkos_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS_EQUAL 2023.1.0) + target_link_libraries(KokkosKernels_sparse_spmv_bsr_benchmark PRIVATE -lstdc++fs) + endif() endif() diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 3f4893363ad9..194ee9afd46a 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -28,85 +28,159 @@ #include #include #include +#include // for graph_max_degree #include #include "KokkosKernels_default_types.hpp" -typedef default_scalar Scalar; -typedef default_lno_t Ordinal; -typedef default_size_type Offset; - -template -void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, - int num_vecs, char mode, Scalar beta) { - typedef KokkosSparse::CrsMatrix - matrix_type; - typedef typename Kokkos::View mv_type; - typedef typename mv_type::HostMirror h_mv_type; - - srand(17312837); - matrix_type A; - if (filename) - A = KokkosSparse::Impl::read_kokkos_crst_matrix(filename); - else { - Offset nnz = 10 * numRows; - // note: the help text says the bandwidth is fixed at 0.01 * numRows - A = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, 0, 0.01 * numRows); - } - numRows = A.numRows(); - numCols = A.numCols(); - - std::cout << "A is " << numRows << "x" << numCols << ", with " << A.nnz() - << " nonzeros\n"; - std::cout << "SpMV mode " << mode << ", " << num_vecs - << " vectors, beta = " << beta << ", multivectors are "; - std::cout << (std::is_same_v ? 
"LayoutLeft" - : "LayoutRight"); - std::cout << '\n'; - - mv_type x("X", numCols, num_vecs); - mv_type y("Y", numRows, num_vecs); - h_mv_type h_x = Kokkos::create_mirror_view(x); - h_mv_type h_y = Kokkos::create_mirror_view(y); - h_mv_type h_y_compare = Kokkos::create_mirror(y); - - for (int v = 0; v < num_vecs; v++) { - for (int i = 0; i < numCols; i++) { - h_x(i, v) = (Scalar)(1.0 * (rand() % 40) - 20.); - } - } +using Scalar = default_scalar; +using Ordinal = default_lno_t; +using Offset = default_size_type; +using KAT = Kokkos::ArithTraits; + +struct SPMVBenchmarking { + // note: CLI currently only allows square matrices to be randomly generated + // and nz/row is fixed at 10 + Ordinal num_rows = 110503; + Ordinal num_cols = 110503; + char mode = 'N'; + int loop = 100; + int num_vecs = 1; + Scalar beta = KAT::zero(); + std::string filename = ""; + bool flush_cache = false; + bool non_reuse = false; - Kokkos::deep_copy(x, h_x); + // Using the parameters above, run and time spmv where x and y use the given + // memory layout. 
+ template + void run() { + using matrix_type = + KokkosSparse::CrsMatrix; + using mv_type = Kokkos::View; + using h_mv_type = typename mv_type::HostMirror; - // Benchmark - auto x0 = Kokkos::subview(x, Kokkos::ALL(), 0); - auto y0 = Kokkos::subview(y, Kokkos::ALL(), 0); - // Do 5 warm up calls (not timed) - for (int i = 0; i < 5; i++) { - if (num_vecs == 1) { - // run the rank-1 version - KokkosSparse::spmv(&mode, 1.0, A, x0, beta, y0); + srand(17312837); + matrix_type A; + if (filename != "") { + std::cout << "Reading A from file \"" << filename << "\"...\n"; + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + filename.c_str()); + num_rows = A.numRows(); + num_cols = A.numCols(); } else { - // rank-2 - KokkosSparse::spmv(&mode, 1.0, A, x, beta, y); + std::cout << "Randomly generating A...\n"; + Offset nnz = 10 * num_rows; + // note: the help text says the bandwidth is fixed at 0.01 * numRows + A = KokkosSparse::Impl::kk_generate_sparse_matrix( + num_rows, num_cols, nnz, 0, 0.01 * num_rows); } - Kokkos::DefaultExecutionSpace().fence(); - } - Kokkos::Timer timer; - for (int i = 0; i < loop; i++) { - if (num_vecs == 1) { - // run the rank-1 version - KokkosSparse::spmv(&mode, 1.0, A, x0, beta, y0); - } else { - // rank-2 - KokkosSparse::spmv(&mode, 1.0, A, x, beta, y); + + std::cout << "A is " << A.numRows() << "x" << A.numCols() << ", with " + << A.nnz() << " nonzeros\n"; + std::cout << "Mean nnz/row: " << (double)A.nnz() / A.numRows() << '\n'; + std::cout << "Max nnz/row: " + << KokkosSparse::Impl::graph_max_degree< + Kokkos::DefaultExecutionSpace, Ordinal>(A.graph.row_map) + << '\n'; + std::cout << "SpMV mode " << mode << ", " << num_vecs + << " vectors, beta = " << beta << ", multivectors are "; + std::cout << (std::is_same_v ? "LayoutLeft" + : "LayoutRight"); + std::cout << '\n'; + + bool transpose_like = (mode == 'T') || (mode == 'H'); + + Ordinal xlen = transpose_like ? A.numRows() : A.numCols(); + Ordinal ylen = transpose_like ? 
A.numCols() : A.numRows(); + + mv_type x("X", xlen, num_vecs); + mv_type y("Y", ylen, num_vecs); + + h_mv_type h_x = Kokkos::create_mirror_view(x); + h_mv_type h_y = Kokkos::create_mirror_view(y); + h_mv_type h_y_compare = Kokkos::create_mirror(y); + + for (int v = 0; v < num_vecs; v++) { + for (Ordinal i = 0; i < xlen; i++) { + h_x(i, v) = (Scalar)(1.0 * (rand() % 40) - 20.); + } } - Kokkos::DefaultExecutionSpace().fence(); + + Kokkos::deep_copy(x, h_x); + + // Benchmark + auto x0 = Kokkos::subview(x, Kokkos::ALL(), 0); + auto y0 = Kokkos::subview(y, Kokkos::ALL(), 0); + + // Create handles for both rank-1 and rank-2 cases, + // even though only 1 will get used below (depending on num_vecs) + + KokkosSparse::SPMVHandle + handle_rank1; + KokkosSparse::SPMVHandle + handle_rank2; + // Assuming that 1GB is enough to fully clear the L3 cache of a CPU, or the + // L2 of a GPU. (Some AMD EPYC chips have 768 MB L3) + Kokkos::View cacheFlushData; + if (flush_cache) { + Kokkos::resize(cacheFlushData, 1024 * 1024 * 1024); + } + + Kokkos::DefaultExecutionSpace space; + + // Do 5 warm up calls (not timed). This will also initialize the handle. + for (int i = 0; i < 5; i++) { + if (num_vecs == 1) { + // run the rank-1 version + if (non_reuse) + KokkosSparse::spmv(space, &mode, 1.0, A, x0, beta, y0); + else + KokkosSparse::spmv(space, &handle_rank1, &mode, 1.0, A, x0, beta, y0); + } else { + // rank-2 + if (non_reuse) + KokkosSparse::spmv(space, &mode, 1.0, A, x, beta, y); + else + KokkosSparse::spmv(space, &handle_rank2, &mode, 1.0, A, x, beta, y); + } + space.fence(); + } + + double totalTime = 0; + Kokkos::Timer timer; + for (int i = 0; i < loop; i++) { + if (flush_cache) { + // Copy some non-zero data to the view multiple times to flush the + // cache. 
(nonzero in case the system has an optimized path for zero + // pages) + for (int rep = 0; rep < 4; rep++) + Kokkos::deep_copy(space, cacheFlushData, char(rep + 1)); + } + space.fence(); + timer.reset(); + if (num_vecs == 1) { + // run the rank-1 version + if (non_reuse) + KokkosSparse::spmv(space, &mode, 1.0, A, x0, beta, y0); + else + KokkosSparse::spmv(space, &handle_rank1, &mode, 1.0, A, x0, beta, y0); + } else { + // rank-2 + if (non_reuse) + KokkosSparse::spmv(space, &mode, 1.0, A, x, beta, y); + else + KokkosSparse::spmv(space, &handle_rank2, &mode, 1.0, A, x, beta, y); + } + space.fence(); + totalTime += timer.seconds(); + } + double avg_time = totalTime / loop; + std::cout << avg_time << " s\n"; } - double avg_time = timer.seconds() / loop; - std::cout << avg_time << " s\n"; -} +}; void print_help() { printf(" -s [nrows] : matrix dimension (square)\n"); @@ -117,8 +191,11 @@ void print_help() { " --layout left|right : memory layout of x/y. Default depends on " "build's default execution space\n"); printf( - " -m N|T : matrix apply mode: N (normal, default), T " - "(transpose)\n"); + " -m N|T|H|C : matrix apply mode:\n" + " N - normal, default\n" + " T - transpose\n" + " H - conjugate transpose\n" + " C - conjugate\n"); printf( " -f [file],-fb [file] : Read in Matrix Market (.mtx), or binary " "(.bin) matrix file.\n"); @@ -126,21 +203,21 @@ void print_help() { " -l [LOOP] : How many spmv to run to aggregate average " "time. 
\n"); printf(" -b beta : beta, as in y := Ax + (beta)y\n"); + printf( + " --flush : Flush the cache between each spmv call " + "(slow!)\n"); + printf( + " --non-reuse : Use non-reuse interface (without " + "SPMVHandle)\n"); } int main(int argc, char** argv) { - long long int size = 110503; // a prime number - char* filename = NULL; - - char mode = 'N'; + SPMVBenchmarking sb; char layout; if (std::is_same::value) layout = 'L'; else layout = 'R'; - int loop = 100; - int num_vecs = 1; - Scalar beta = 0.0; if (argc == 1) { print_help(); @@ -149,27 +226,31 @@ int main(int argc, char** argv) { for (int i = 0; i < argc; i++) { if ((strcmp(argv[i], "-s") == 0)) { - size = atoi(argv[++i]); + // only square matrices supported now + sb.num_rows = atoi(argv[++i]); + sb.num_cols = sb.num_rows; continue; } if ((strcmp(argv[i], "-f") == 0 || strcmp(argv[i], "-fb") == 0)) { - filename = argv[++i]; + sb.filename = argv[++i]; continue; } if ((strcmp(argv[i], "-l") == 0)) { - loop = atoi(argv[++i]); + sb.loop = atoi(argv[++i]); continue; } if ((strcmp(argv[i], "-m") == 0)) { - mode = toupper(argv[++i][0]); + sb.mode = toupper(argv[++i][0]); + if (sb.mode != 'N' && sb.mode != 'T' && sb.mode != 'C' && sb.mode != 'H') + throw std::invalid_argument("Mode must be one of N, T, C or H."); continue; } if ((strcmp(argv[i], "--nv") == 0)) { - num_vecs = atoi(argv[++i]); + sb.num_vecs = atoi(argv[++i]); continue; } if ((strcmp(argv[i], "-b") == 0)) { - beta = atof(argv[++i]); + sb.beta = atof(argv[++i]); continue; } if ((strcmp(argv[i], "--layout") == 0)) { @@ -180,6 +261,15 @@ int main(int argc, char** argv) { layout = 'R'; else throw std::runtime_error("Invalid layout"); + continue; + } + if ((strcmp(argv[i], "--flush") == 0)) { + sb.flush_cache = true; + continue; + } + if ((strcmp(argv[i], "--non-reuse") == 0)) { + sb.non_reuse = true; + continue; } if ((strcmp(argv[i], "--help") == 0) || (strcmp(argv[i], "-h") == 0)) { print_help(); @@ -190,11 +280,9 @@ int main(int argc, char** argv) { 
Kokkos::initialize(argc, argv); if (layout == 'L') - run_spmv(size, size, filename, loop, num_vecs, mode, - beta); + sb.template run(); else - run_spmv(size, size, filename, loop, num_vecs, mode, - beta); + sb.template run(); Kokkos::finalize(); } diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spadd.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spadd.cpp index 3b347eb90325..a785ea82f628 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spadd.cpp @@ -303,8 +303,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { double numericTime = 0; // Do an untimed warm up symbolic, and preallocate space for C entries/values - spadd_symbolic(&kh, A.graph.row_map, A.graph.entries, B.graph.row_map, - B.graph.entries, row_mapC); + spadd_symbolic(exec_space{}, &kh, A.numRows(), A.numCols(), A.graph.row_map, + A.graph.entries, B.graph.row_map, B.graph.entries, row_mapC); bool use_kk = !params.use_cusparse && !params.use_mkl; @@ -366,7 +366,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { for (int sumRep = 0; sumRep < params.repeat; sumRep++) { timer.reset(); if (use_kk) { - spadd_symbolic(&kh, A.graph.row_map, A.graph.entries, B.graph.row_map, + spadd_symbolic(exec_space{}, &kh, A.numRows(), A.numCols(), + A.graph.row_map, A.graph.entries, B.graph.row_map, B.graph.entries, row_mapC); c_nnz = addHandle->get_c_nnz(); } else if (params.use_cusparse) { @@ -434,7 +435,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { } #endif } else { - spadd_numeric(&kh, A.graph.row_map, A.graph.entries, A.values, + spadd_numeric(exec_space{}, &kh, A.numRows(), A.numCols(), + A.graph.row_map, A.graph.entries, A.values, 1.0, // A, alpha B.graph.row_map, B.graph.entries, B.values, 1.0, // B, beta diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp 
b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index 0f705e12090d..33cb8a0f5f17 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -237,17 +237,10 @@ int main(int argc, char** argv) { Kokkos::print_configuration(std::cout); #if defined(KOKKOS_ENABLE_OPENMP) - if (params.use_openmp) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_HBWSPACE - KokkosKernels::Experiment::run_spgemm_jacobi< - size_type, lno_t, scalar_t, Kokkos::OpenMP, - Kokkos::Experimental::HBWSpace, Kokkos::HostSpace>(params); -#else KokkosKernels::Experiment::run_spgemm_jacobi< size_type, lno_t, scalar_t, Kokkos::OpenMP, Kokkos::OpenMP::memory_space, Kokkos::OpenMP::memory_space>(params); -#endif } #endif diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spiluk.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spiluk.cpp index 331ae9ec82b7..c85b126019e2 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -144,12 +144,6 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, // std::cout << "Create handle" << std::endl; switch (test) { - case LVLSCHED_RP: - kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, - EXPAND_FACT * nnz * (fill_lev + 1), - EXPAND_FACT * nnz * (fill_lev + 1)); - kh.get_spiluk_handle()->print_algorithm(); - break; case LVLSCHED_TP1: kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, EXPAND_FACT * nnz * (fill_lev + 1), diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp index aeaa37db9644..6adf55b26e0e 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_benchmark.cpp @@ -35,13 
+35,20 @@ namespace { struct spmv_parameters { - int N, offset; + int N, offset, numvecs; + std::string mode; std::string filename; std::string alg; std::string tpl; spmv_parameters(const int N_) - : N(N_), offset(0), filename(""), alg(""), tpl("") {} + : N(N_), + offset(0), + numvecs(1), + mode(""), + filename(""), + alg(""), + tpl("") {} }; void print_options() { @@ -49,9 +56,11 @@ void print_options() { std::cerr << perf_test::list_common_options(); - std::cerr - << "\t[Optional] --repeat :: how many times to repeat overall test" - << std::endl; + std::cerr << "\t[Optional] --mode :: whether to run a suite of " + << "automated test or manually define one (auto, manual)" + << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " + << "test" << std::endl; std::cerr << " -n [N] :: generate a semi-random banded (band size " "0.01xN)\n" "NxN matrix with average of 10 entries per row." @@ -59,25 +68,30 @@ void print_options() { std::cerr << "\t[Optional] --alg :: the algorithm to run (default, " "native, merge)" << std::endl; - std::cerr - << "\t[Optional] --alg :: the algorithm to run (classic, merge)" - << std::endl; std::cerr << "\t[Optional] --TPL :: when available and compatible with " "alg, a TPL can be used (cusparse, rocsparse, MKL)" << std::endl; - std::cerr - << " -f [file] : Read in Matrix Market formatted text file 'file'." - << std::endl; + std::cerr << " -f [file] : Read in Matrix Market formatted text file" + << " 'file'." << std::endl; std::cerr << " --offset [O] : Subtract O from every index.\n" << " Useful in case the matrix market file is " "not 0 based." 
<< std::endl; + std::cerr << " --num_vecs : The number of vectors stored in X and Y" + << std::endl; } // print_options void parse_inputs(int argc, char** argv, spmv_parameters& params) { for (int i = 1; i < argc; ++i) { if (perf_test::check_arg_int(i, argc, argv, "-n", params.N)) { ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--mode", params.alg)) { + if ((params.mode != "") && (params.mode != "auto") && + (params.alg != "manual")) { + throw std::runtime_error( + "--mode can only be an empty string, `auto` or `manual`!"); + } + ++i; } else if (perf_test::check_arg_str(i, argc, argv, "--alg", params.alg)) { if ((params.alg != "") && (params.alg != "default") && (params.alg != "native") && (params.alg != "merge")) { @@ -93,6 +107,9 @@ void parse_inputs(int argc, char** argv, spmv_parameters& params) { } else if (perf_test::check_arg_int(i, argc, argv, "--offset", params.offset)) { ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--num_vecs", + params.numvecs)) { + ++i; } else { print_options(); KK_USER_REQUIRE_MSG(false, "Unrecognized command line argument #" @@ -105,13 +122,21 @@ template void run_spmv(benchmark::State& state, const spmv_parameters& inputs) { using matrix_type = KokkosSparse::CrsMatrix; - using mv_type = Kokkos::View; - - KokkosKernels::Experimental::Controls controls; - if ((inputs.alg == "default") || (inputs.alg == "native") || - (inputs.alg == "merge")) { - controls.setParameter("algorithm", inputs.alg); + using mv_type = Kokkos::View; + using handle_t = + KokkosSparse::SPMVHandle; + + KokkosSparse::SPMVAlgorithm spmv_alg; + if ((inputs.alg == "default") || (inputs.alg == "")) { + spmv_alg = KokkosSparse::SPMVAlgorithm::SPMV_DEFAULT; + } else if (inputs.alg == "native") { + spmv_alg = KokkosSparse::SPMVAlgorithm::SPMV_NATIVE; + } else if (inputs.alg == "merge") { + spmv_alg = KokkosSparse::SPMVAlgorithm::SPMV_MERGE_PATH; + } else { + throw std::runtime_error("invalid spmv algorithm"); } + handle_t handle(spmv_alg); // 
Create test matrix srand(17312837); @@ -126,16 +151,17 @@ void run_spmv(benchmark::State& state, const spmv_parameters& inputs) { } // Create input vectors - mv_type x("X", A.numRows()); - mv_type y("Y", A.numCols()); + mv_type x("X", A.numRows(), inputs.numvecs); + mv_type y("Y", A.numCols(), inputs.numvecs); Kokkos::Random_XorShift64_Pool rand_pool(13718); Kokkos::fill_random(x, rand_pool, 10); Kokkos::fill_random(y, rand_pool, 10); + Kokkos::fence(); // Run the actual experiments for (auto _ : state) { - KokkosSparse::spmv(controls, KokkosSparse::NoTranspose, 1.0, A, x, 0.0, y); + KokkosSparse::spmv(&handle, KokkosSparse::NoTranspose, 1.0, A, x, 0.0, y); Kokkos::fence(); } } @@ -158,12 +184,25 @@ int main(int argc, char** argv) { spmv_parameters inputs(100000); parse_inputs(argc, argv, inputs); - // Google benchmark will report the wrong n if an input file matrix is used. - KokkosKernelsBenchmark::register_benchmark_real_time( - bench_name.c_str(), run_spmv, {"n"}, - {inputs.N}, common_params.repeat, inputs); - benchmark::RunSpecifiedBenchmarks(); + if ((inputs.mode == "") || (inputs.mode == "auto")) { + for (int n : {10000, 20000, 40000, 100000, 250000, 1000000}) { + for (int nv : {1, 2, 3, 4, 10}) { + inputs.N = n; + inputs.numvecs = nv; + KokkosKernelsBenchmark::register_benchmark_real_time( + bench_name.c_str(), run_spmv, + {"n", "nv"}, {inputs.N, inputs.numvecs}, common_params.repeat, + inputs); + } + } + } else { + // Google benchmark will report the wrong n if an input file matrix is used. 
+ KokkosKernelsBenchmark::register_benchmark_real_time( + bench_name.c_str(), run_spmv, {"n"}, + {inputs.N}, common_params.repeat, inputs); + } + benchmark::RunSpecifiedBenchmarks(); benchmark::Shutdown(); Kokkos::finalize(); diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_bsr.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_bsr.cpp index d3b038f0e42f..d96a3c6c8d76 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_bsr.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_bsr.cpp @@ -159,18 +159,22 @@ int test_bsr_matrix_single_vec( y_vector_type ycrs("crs_product_result", nRow); auto h_ycrs = Kokkos::create_mirror_view(ycrs); - KokkosKernels::Experimental::Controls controls; + KokkosSparse::SPMVAlgorithm algo = KokkosSparse::SPMV_DEFAULT; + switch (static_cast(test)) { case Implementation::KokkosKernels: { - controls.setParameter("algorithm", "native"); + algo = KokkosSparse::SPMV_NATIVE; } break; default: break; } + KokkosSparse::SPMVHandle + handle_crs(algo); // Do the multiplication for warming up for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); Kokkos::deep_copy(ycrs, h_ycrs); - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); // Time a series of multiplications with the CrsMatrix double time_crs = 0.0; @@ -178,7 +182,7 @@ int test_bsr_matrix_single_vec( for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); Kokkos::deep_copy(ycrs, h_ycrs); Kokkos::Timer timer; - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); time_crs += timer.seconds(); Kokkos::fence(); } @@ -192,10 +196,14 @@ int test_bsr_matrix_single_vec( scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, int> Absr(Acrs, blockSize); + KokkosSparse::SPMVHandle + handle_bsr(algo); + // Do the multiplication for warming up for (Ordinal 
ir = 0; ir < nRow; ++ir) h_ybsr(ir) = h_y0(ir); Kokkos::deep_copy(ybsr, h_ybsr); - KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); // Time a series of multiplications with the BsrMatrix double time_bsr = 0.0; @@ -203,7 +211,7 @@ int test_bsr_matrix_single_vec( for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir) = h_y0(ir); Kokkos::deep_copy(ybsr, h_ybsr); Kokkos::Timer timer; - KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); time_bsr += timer.seconds(); Kokkos::fence(); } @@ -316,19 +324,23 @@ int test_bsr_matrix_vec( block_vector_t ycrs("crs_product_result", nRow, nvec); auto h_ycrs = Kokkos::create_mirror_view(ycrs); - KokkosKernels::Experimental::Controls controls; + KokkosSparse::SPMVAlgorithm algo = KokkosSparse::SPMV_DEFAULT; + switch (static_cast(test)) { case Implementation::KokkosKernels: { - controls.setParameter("algorithm", "native"); + algo = KokkosSparse::SPMV_NATIVE; } break; default: break; } + KokkosSparse::SPMVHandle + handle_crs(algo); // Do the multiplication for warming up for (Ordinal jc = 0; jc < nvec; ++jc) for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); Kokkos::deep_copy(ycrs, h_ycrs); - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); // Time a series of multiplications with the CrsMatrix format double time_crs = 0.0; @@ -337,7 +349,7 @@ int test_bsr_matrix_vec( for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); Kokkos::deep_copy(ycrs, h_ycrs); Kokkos::Timer timer; - KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(&handle_crs, fOp, alpha, Acrs, xref, beta, ycrs); time_crs += timer.seconds(); Kokkos::fence(); } @@ -347,6 +359,10 @@ int test_bsr_matrix_vec( scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, 
void, int> Absr(Acrs, blockSize); + KokkosSparse::SPMVHandle + handle_bsr(algo); + block_vector_t ybsr("bsr_product_result", nRow, nvec); auto h_ybsr = Kokkos::create_mirror_view(ybsr); @@ -354,7 +370,7 @@ int test_bsr_matrix_vec( for (Ordinal jc = 0; jc < nvec; ++jc) for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir, jc) = h_y0(ir, jc); Kokkos::deep_copy(ybsr, h_ybsr); - KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); // Time a series of multiplications with the BsrMatrix double time_bsr = 0.0; @@ -363,7 +379,7 @@ int test_bsr_matrix_vec( for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir, jc) = h_y0(ir, jc); Kokkos::deep_copy(ybsr, h_ybsr); Kokkos::Timer timer; - KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + KokkosSparse::spmv(&handle_bsr, fOp, alpha, Absr, xref, beta, ybsr); time_bsr += timer.seconds(); Kokkos::fence(); } diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp index 770b09cfb1f1..254a35c34fb9 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp @@ -207,9 +207,10 @@ struct SpmvNative { typename YView> static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, const XView &x, const Beta &beta, const YView &y) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "native"); - return KokkosSparse::spmv(controls, mode, alpha, crs, x, beta, y); + KokkosSparse::SPMVHandle + handle(KokkosSparse::SPMV_NATIVE); + return KokkosSparse::spmv(&handle, mode, alpha, crs, x, beta, y); } static std::string name() { return "native"; } @@ -221,9 +222,10 @@ struct SpmvV41 { typename YView> static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, const XView &x, const 
Beta &beta, const YView &y) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "v4.1"); - return KokkosSparse::spmv(controls, mode, alpha, crs, x, beta, y); + KokkosSparse::SPMVHandle + handle(KokkosSparse::SPMV_BSR_V41); + return KokkosSparse::spmv(&handle, mode, alpha, crs, x, beta, y); } static std::string name() { return "v4.1"; } @@ -473,4 +475,4 @@ int main(int argc, char **argv) { drop_cache(); Kokkos::finalize(); return 0; -} \ No newline at end of file +} diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_merge.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_merge.cpp index 6ad772116ef2..fdd2905b523c 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_merge.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_merge.cpp @@ -148,9 +148,8 @@ matrix_type generate_unbalanced_matrix( void print_help() { printf("SPMV merge benchmark code written by Luc Berger-Vergiat.\n"); - printf( - "The goal is to test cusSPARSE's merge algorithm on imbalanced " - "matrices."); + printf("The goal is to compare the merge path algorithm vs.\n"); + printf("TPLs and the KK native algorithm on imbalanced matrices.\n"); printf("Options:\n"); printf( " --compare : Compare the performance of the merge algo with the " @@ -233,35 +232,59 @@ int main(int argc, char** argv) { Kokkos::initialize(argc, argv); { - if (std::is_same::value) { - // Note that we template the matrix with entries=lno_t and offsets=lno_t - // to make sure it verifies the cusparse requirements - using matrix_type = - KokkosSparse::CrsMatrix; - using values_type = typename matrix_type::values_type::non_const_type; - const Scalar SC_ONE = Kokkos::ArithTraits::one(); - const Scalar alpha = SC_ONE + SC_ONE; - const Scalar beta = alpha + SC_ONE; - - matrix_type test_matrix = generate_unbalanced_matrix( - numRows, numEntries, numLongRows, numLongEntries); - - values_type y("right hand side", 
test_matrix.numRows()); - values_type x("left hand side", test_matrix.numCols()); - Kokkos::deep_copy(x, SC_ONE); - Kokkos::deep_copy(y, SC_ONE); - - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "merge"); - - // Perform a so called "warm-up" run - KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); - - double min_time = 1.0e32, max_time = 0.0, avg_time = 0.0; + // Note that we template the matrix with entries=lno_t and offsets=lno_t + // so that TPLs can be used + using matrix_type = + KokkosSparse::CrsMatrix; + using values_type = typename matrix_type::values_type::non_const_type; + using handle_type = + KokkosSparse::SPMVHandle; + const Scalar SC_ONE = Kokkos::ArithTraits::one(); + const Scalar alpha = SC_ONE + SC_ONE; + const Scalar beta = alpha + SC_ONE; + + matrix_type test_matrix = generate_unbalanced_matrix( + numRows, numEntries, numLongRows, numLongEntries); + + values_type y("right hand side", test_matrix.numRows()); + values_type x("left hand side", test_matrix.numCols()); + Kokkos::deep_copy(x, SC_ONE); + Kokkos::deep_copy(y, SC_ONE); + + handle_type handleMerge(KokkosSparse::SPMV_MERGE_PATH); + + // Perform a so called "warm-up" run + KokkosSparse::spmv(&handleMerge, "N", alpha, test_matrix, x, beta, y); + + double min_time = 1.0e32, max_time = 0.0, avg_time = 0.0; + for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { + Kokkos::Timer timer; + KokkosSparse::spmv(&handleMerge, "N", alpha, test_matrix, x, beta, y); + Kokkos::fence(); + double time = timer.seconds(); + avg_time += time; + if (time > max_time) max_time = time; + if (time < min_time) min_time = time; + } + + std::cout << "KK Merge alg --- min: " << min_time << " max: " << max_time + << " avg: " << avg_time / loop << std::endl; + + // Run the cusparse default algorithm and native kokkos-kernels algorithm + // then output timings for comparison + if (compare) { + handle_type handleDefault; + // Warm up + KokkosSparse::spmv(&handleDefault, 
"N", alpha, test_matrix, x, beta, y); + + min_time = 1.0e32; + max_time = 0.0; + avg_time = 0.0; for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { Kokkos::Timer timer; - KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); + KokkosSparse::spmv(&handleDefault, "N", alpha, test_matrix, x, beta, y); Kokkos::fence(); double time = timer.seconds(); avg_time += time; @@ -269,58 +292,28 @@ int main(int argc, char** argv) { if (time < min_time) min_time = time; } - std::cout << "cuSPARSE Merge alg --- min: " << min_time + std::cout << "Default alg --- min: " << min_time << " max: " << max_time << " avg: " << avg_time / loop << std::endl; - // Run the cusparse default algorithm and native kokkos-kernels algorithm - // then output timings for comparison - if (compare) { - controls.setParameter("algorithm", "default"); - - min_time = 1.0e32; - max_time = 0.0; - avg_time = 0.0; - for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { - Kokkos::Timer timer; - KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); - Kokkos::fence(); - double time = timer.seconds(); - avg_time += time; - if (time > max_time) max_time = time; - if (time < min_time) min_time = time; - } - - std::cout << "cuSPARSE Default alg --- min: " << min_time - << " max: " << max_time << " avg: " << avg_time / loop - << std::endl; - - controls.setParameter("algorithm", "native"); - - min_time = 1.0e32; - max_time = 0.0; - avg_time = 0.0; - for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { - Kokkos::Timer timer; - // KokkosSparse::spmv(controls, "N", alpha, test_matrix, x, beta, y); - KokkosSparse::Impl::spmv_beta(Kokkos::DefaultExecutionSpace{}, - controls, "N", alpha, test_matrix, x, - beta, y); - Kokkos::fence(); - double time = timer.seconds(); - avg_time += time; - if (time > max_time) max_time = time; - if (time < min_time) min_time = time; - } - - std::cout << "Kokkos Native alg --- min: " << min_time - << " max: " << max_time << " avg: " << avg_time / loop - << std::endl; + 
handle_type handleNative(KokkosSparse::SPMV_NATIVE); + KokkosSparse::spmv(&handleNative, "N", alpha, test_matrix, x, beta, y); + + min_time = 1.0e32; + max_time = 0.0; + avg_time = 0.0; + for (int iterIdx = 0; iterIdx < loop; ++iterIdx) { + Kokkos::Timer timer; + KokkosSparse::spmv(&handleNative, "N", alpha, test_matrix, x, beta, y); + Kokkos::fence(); + double time = timer.seconds(); + avg_time += time; + if (time > max_time) max_time = time; + if (time < min_time) min_time = time; } - } else { - std::cout << "The default execution space is not Cuda, nothing to do!" + + std::cout << "KK Native alg --- min: " << min_time + << " max: " << max_time << " avg: " << avg_time / loop << std::endl; } } diff --git a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index 02fcd1640aa3..85aab62122f5 100644 --- a/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/packages/kokkos-kernels/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -581,7 +581,9 @@ int main(int argc, char** argv) { const double alpha = 1.0, beta = 1.0; size_t bufferSize = 0; void* dBuffer = NULL; -#if CUSPARSE_VERSION >= 11201 + +// CUSPARSE_MM_ALG_DEFAULT was deprecated in CUDA 11.2.1 a.k.a cuSPARSE 11.4.0 +#if CUSPARSE_VERSION >= 11400 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; #else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; diff --git a/packages/kokkos-kernels/scripts/cm_test_all_sandia b/packages/kokkos-kernels/scripts/cm_test_all_sandia index 28ef93b0049c..eb296091af8c 100755 --- a/packages/kokkos-kernels/scripts/cm_test_all_sandia +++ b/packages/kokkos-kernels/scripts/cm_test_all_sandia @@ -91,7 +91,10 @@ print_help() { echo "--with-tpls=TPLS: set KOKKOSKERNELS_ENABLE_TPLS" echo " Provide a comma-separated list of TPLs" echo " Valid items:" - echo " blas, mkl, cublas, cusparse, magma, armpl, rocblas, rocsparse" + echo " blas, mkl, cublas, 
cusparse, cusolver, magma, armpl, rocblas, rocsparse, rocsolver" + echo "" + echo "--cmake-flags=[CMAKE Command options]: Set Kokkos Kernels cmake options not handled by script" + echo "--kokkos-cmake-flags=[CMAKE Command options]: Set Kokkos cmake options not handled by script" echo "" echo "ARGS: list of expressions matching compilers to test" @@ -145,14 +148,8 @@ if [[ "$HOSTNAME" =~ weaver.* ]]; then module load git fi -if [[ "$HOSTNAME" =~ .*voltrino.* ]]; then - MACHINE=voltrino - module load git -fi - if [[ "$HOSTNAME" == *blake* ]]; then # Warning: very generic name MACHINE=blake - module load git fi if [[ "$HOSTNAME" == *solo* ]]; then # Warning: very generic name @@ -163,15 +160,6 @@ if [[ "$HOSTNAME" == kokkos-dev-2* ]]; then MACHINE=kokkos-dev-2 fi -if [[ "$HOSTNAME" == may* ]]; then - MACHINE=mayer -# module load git -fi - -if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name - MACHINE=mayer -fi - if [[ "$HOSTNAME" == caraway* ]]; then # Warning: very generic name MACHINE=caraway fi @@ -210,7 +198,6 @@ fi echo "Running on machine: $MACHINE" GCC_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial" -IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" INTEL_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial" CLANG_BUILD_LIST="Threads,Serial,Threads_Serial" @@ -218,7 +205,6 @@ CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Threads,Cuda_Serial" CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" -IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" 
INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized,-diag-disable=1011,-diag-disable=869" CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" @@ -418,6 +404,12 @@ do --with-tpls*) KOKKOSKERNELS_ENABLE_TPLS="${key#*=}" ;; + --cmake-flags*) + PASSTHRU_CMAKE_FLAGS="${key#*=}" + ;; + --kokkos-cmake-flags*) + KOKKOS_PASSTHRU_CMAKE_FLAGS="${key#*=}" + ;; --help*) PRINT_HELP=True ;; @@ -636,45 +628,14 @@ elif [ "$MACHINE" = "weaver" ]; then SPACK_HOST_ARCH="+power9" SPACK_CUDA_ARCH="+volta70" -elif [ "$MACHINE" = "voltrino" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/20.11.4a,/,gcc/9.3.0" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/19.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=KNL" - fi -elif [ "$MACHINE" = "mayer" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="cmake/3.17.1,/" - - ARMCLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gnu9/9.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/20.1 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $ARMCLANG_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMV8_THUNDERX2" - fi - - SPACK_HOST_ARCH="+armv8_tx2" elif [ "$MACHINE" = "caraway" ]; then SKIP_HWLOC=True # BUILD_ONLY=True # report_and_log_test_result: only testing compilation of code for now, # output description and success based only on build succes; build time output (no run-time) - BASE_MODULE_LIST="cmake/3.19.3,/" - ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20/rocm/5.2.0" + BASE_MODULE_LIST="cmake,/" + 
ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20" HIPCLANG_BUILD_LIST="Hip_Serial" HIPCLANG_WARNING_FLAGS="" @@ -686,10 +647,7 @@ elif [ "$MACHINE" = "caraway" ]; then else # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" - "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) fi @@ -705,23 +663,23 @@ elif [ "$MACHINE" = "vega90a_caraway" ]; then # output description and success based only on build succes; build time output (no run-time) BASE_MODULE_LIST="cmake,/" - ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20/rocm/5.2.0" + ROCM520_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.20" + ROCM_TPL_MODULE_LIST="$BASE_MODULE_LIST,openblas/0.3.23" HIPCLANG_BUILD_LIST="Hip_Serial" HIPCLANG_WARNING_FLAGS="" if [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("rocm/5.6.0 $ROCM520_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + COMPILERS=("rocm/5.6.1 $ROCM_TPL_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "rocm/6.0.0 $ROCM_TPL_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("rocm/5.6.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + COMPILERS=("rocm/5.2.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "rocm/5.6.1 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" - "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ 
$GCC_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "rocm/6.0.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) fi @@ -731,53 +689,56 @@ elif [ "$MACHINE" = "vega90a_caraway" ]; then ARCH_FLAG="--arch=VEGA90A" fi elif [ "$MACHINE" = "blake" ]; then + MODULE_ENVIRONMENT="source /projects/x86-64-icelake-rocky8/spack-config/blake-setup-user-module-env.sh" eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - module load cmake/3.19.3 - BASE_MODULE_LIST="cmake/3.19.3,/" - BASE_MODULE_LIST_INTEL="cmake/3.19.3,/compilers/" - BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/,/oneAPI/hpc-toolkit/" - ONEAPI_WARNING_FLAGS="" + module load cmake - GCC102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21/gcc/10.2.0" + BASE_MODULE_LIST="cmake,/" + BASE_MODULE_LIST_TPLS="cmake,/,openblas/0.3.23" + BASE_MODULE_LIST_ONEAPI_202310="cmake,-oneapi-compilers/,intel-oneapi-dpl/2022.1.0,intel-oneapi-mkl/2023.1.0,intel-oneapi-tbb/2021.9.0" + BASE_MODULE_LIST_ONEAPI_202320="cmake,-oneapi-compilers/,intel-oneapi-dpl/2022.2.0,intel-oneapi-mkl/2023.2.0,intel-oneapi-tbb/2021.10.0" + ONEAPI_FLAGS_EXTRA="-fp-model=precise" + LLVM_EXTRA_FLAGS="-fPIC ${CLANG_WARNING_FLAGS}" + # Remove -Wuninitialized: compiler issues show up with Threads backend + GCC11_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered" + # update KOKKOS_PASSTHRU_CMAKE_FLAGS to disable onedpl on Blake + KOKKOS_PASSTHRU_CMAKE_FLAGS="${KOKKOS_PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_ONEDPL=OFF" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - # TODO: Failing toolchains: - #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc 
$INTEL_WARNING_FLAGS" - #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" - "intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" + COMPILERS=("intel/2023.1.0 $BASE_MODULE_LIST_ONEAPI_202310 "OpenMP,Threads,Serial" icpx $ONEAPI_FLAGS_EXTRA" + "intel/2023.2.0 $BASE_MODULE_LIST_ONEAPI_202320 "OpenMP,Threads,Serial" icpx $ONEAPI_FLAGS_EXTRA" + "llvm/15.0.7 $BASE_MODULE_LIST "Threads,Serial" clang++ $LLVM_EXTRA_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST "OpenMP,Threads,Serial" g++ $GCC11_WARNING_FLAGS" + "gcc/12.2.0 $BASE_MODULE_LIST "OpenMP,Threads,Serial" g++ $GCC11_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" - "gcc/10.2.0 $GCC102_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" + # Known issues: + # gcc/12.2.0+openblas/0.3.23 with OpenMP: internal compiler error: in get_vectype_for_scalar_type, at tree-vect-stmts + COMPILERS=("intel/2023.1.0 $BASE_MODULE_LIST_ONEAPI_202310 "OpenMP,Threads,Serial" icpx $ONEAPI_FLAGS_EXTRA" + "intel/2023.2.0 $BASE_MODULE_LIST_ONEAPI_202320 "OpenMP,Threads,Serial" icpx $ONEAPI_FLAGS_EXTRA" + "llvm/15.0.7 $BASE_MODULE_LIST_TPLS "Threads,Serial" clang++ $LLVM_EXTRA_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST_TPLS "OpenMP,Threads,Serial" g++ $GCC11_WARNING_FLAGS" + "gcc/12.2.0 $BASE_MODULE_LIST_TPLS "OpenMP,Threads,Serial" g++ $GCC11_WARNING_FLAGS" ) else - COMPILERS=("intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/2021.2.0 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" - "intel/2021.4.0 
$BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" - "intel/2022.1.2 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" - "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/8.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" + # gcc/12.2.0 with OpenMP: internal compiler error: in get_vectype_for_scalar_type, at tree-vect-stmts + COMPILERS=("intel/2023.1.0 $BASE_MODULE_LIST_ONEAPI_202310 $INTEL_BUILD_LIST icpx $ONEAPI_FLAGS_EXTRA" + "intel/2023.2.0 $BASE_MODULE_LIST_ONEAPI_202320 $INTEL_BUILD_LIST icpx $ONEAPI_FLAGS_EXTRA" + "llvm/15.0.7 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $LLVM_EXTRA_FLAGS" + "gcc/11.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC11_WARNING_FLAGS" + "gcc/12.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC11_WARNING_FLAGS" ) fi if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=SKX" + ARCH_FLAG="--arch=SPR" fi - SPACK_HOST_ARCH="+skx" + SPACK_HOST_ARCH="+spr" elif [ "$MACHINE" = "solo" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 @@ -792,8 +753,7 @@ elif [ "$MACHINE" = "solo" ]; then GNU102_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.3.21" if [ "$SPOT_CHECK" = "True" ]; then - COMPILERS=( - "gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" + COMPILERS=("gnu/10.2.1 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GNU_WARNING_FLAGS" "llvm/10.0.1 $BASE_MODULE_LIST_LLVM "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then @@ -802,8 +762,7 @@ elif [ "$MACHINE" = "solo" ]; then ) else ###"clang/10.0.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - COMPILERS=( - "gnu/10.2.1 
$BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" + COMPILERS=("gnu/10.2.1 $BASE_MODULE_LIST $GNU_BUILD_LIST g++ $GNU_WARNING_FLAGS" ) fi @@ -884,6 +843,7 @@ fi export OMP_NUM_THREADS=${omp_num_threads:=8} export OMP_PROC_BIND=${omp_proc_bind:=spread} export OMP_PLACES=${omp_places:=cores} +export KOKKOS_NUM_THREADS=8 declare -i NUM_RESULTS_TO_KEEP=7 @@ -947,6 +907,7 @@ if [ "$COMPILERS_TO_TEST" == "" ]; then exit 1 fi + # # Functions. # @@ -1082,11 +1043,11 @@ setup_env() { if [[ "${SPOT_CHECK_TPLS}" = "True" ]]; then # device tpls if [[ "$compiler" == cuda* ]]; then - NEW_TPL_LIST="cublas,cusparse," + NEW_TPL_LIST="cublas,cusparse,cusolver," export KOKKOS_CUDA_OPTIONS="${KOKKOS_CUDA_OPTIONS},enable_lambda" fi if [[ "$compiler" == rocm* ]]; then - NEW_TPL_LIST="rocblas,rocsparse," + NEW_TPL_LIST="rocblas,rocsparse,rocsolver," fi # host tpls - use mkl with intel, else use host blas if [[ "$compiler" == intel* ]]; then @@ -1120,10 +1081,9 @@ setup_env() { if [[ "${SPOT_CHECK_TPLS}" = "True" ]]; then # Some machines will require explicitly setting include dirs and libs - if ([[ "$MACHINE" = weaver* ]] || [[ "$MACHINE" = blake* ]] || [[ "$MACHINE" = sogpu* ]]) && [[ "$mod" = openblas* ]]; then + if ([[ "$MACHINE" = weaver* ]] || [[ "$MACHINE" = sogpu* ]]) && [[ "$mod" = openblas* ]]; then BLAS_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" LAPACK_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" - # BLAS_LIBRARIES="openblas" BLAS_LIBRARIES="blas" LAPACK_LIBRARIES="lapack" KOKKOSKERNELS_TPL_PATH_CMD="--user-blas-path=${BLAS_LIBRARY_DIRS} --user-lapack-path=${LAPACK_LIBRARY_DIRS}" @@ -1131,6 +1091,16 @@ setup_env() { KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD="--extra-linker-flags=-lgfortran,-lm" echo "TPL PATHS: KOKKOSKERNELS_TPL_PATH_CMD=$KOKKOSKERNELS_TPL_PATH_CMD" echo "TPL LIBS: KOKKOSKERNELS_TPL_LIBS_CMD=$KOKKOSKERNELS_TPL_LIBS_CMD" + elif [[ "$MACHINE" = blake* ]] && [[ "$mod" = openblas* ]]; then + BLAS_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" + LAPACK_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" + 
BLAS_LIBRARIES="openblas" + LAPACK_LIBRARIES="openblas" + KOKKOSKERNELS_TPL_PATH_CMD="--user-blas-path=${BLAS_LIBRARY_DIRS} --user-lapack-path=${LAPACK_LIBRARY_DIRS}" + KOKKOSKERNELS_TPL_LIBS_CMD="--user-blas-lib=${BLAS_LIBRARIES} --user-lapack-lib=${LAPACK_LIBRARIES}" + KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD="--extra-linker-flags=-lgfortran,-lm" + echo "TPL PATHS: KOKKOSKERNELS_TPL_PATH_CMD=$KOKKOSKERNELS_TPL_PATH_CMD" + echo "TPL LIBS: KOKKOSKERNELS_TPL_LIBS_CMD=$KOKKOSKERNELS_TPL_LIBS_CMD" elif ([[ "$MACHINE" = weaver* ]]) && [[ "$mod" = netlib* ]]; then BLAS_LIBRARY_DIRS="${BLAS_ROOT}/lib" LAPACK_LIBRARY_DIRS="${BLAS_ROOT}/lib" @@ -1161,8 +1131,9 @@ single_build_and_test() { # Set up env. local compiler_modules_list=$(get_compiler_modules $compiler) - mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" - cd $ROOT_DIR/$compiler/"${build}-$build_type" + local BUILD_AND_TEST_DIR=$ROOT_DIR/$compiler/"${build}-$build_type" + mkdir -p $BUILD_AND_TEST_DIR + cd $BUILD_AND_TEST_DIR local kokkos_variants=$(get_kokkos_variants $compiler) local kernels_variants=$(get_kernels_variants $compiler) @@ -1205,6 +1176,7 @@ single_build_and_test() { echo " export OMP_NUM_THREADS=$omp_num_threads" &>> reload_modules.sh echo " export OMP_PROC_BIND=$omp_proc_bind" &>> reload_modules.sh echo " export OMP_PLACES=$omp_places" &>> reload_modules.sh + echo " export KOKKOS_NUM_THREADS=8" &>> reload_modules.sh echo "" &>> reload_modules.sh chmod +x reload_modules.sh @@ -1284,6 +1256,7 @@ single_build_and_test() { HIP_ENABLE_CMD="--with-hip" fi local arch_code=$(echo $ARCH_FLAG | cut -d "=" -f 2) + local tpl_list_print=$(echo $KOKKOSKERNELS_ENABLE_TPL_CMD | cut -d "=" -f2-) echo "kokkos devices: ${LOCAL_KOKKOS_DEVICES}" echo "kokkos arch: ${arch_code}" echo "kokkos options: ${KOKKOS_OPTIONS}" @@ -1294,16 +1267,17 @@ single_build_and_test() { echo "kokkoskernels ordinals: ${KOKKOSKERNELS_ORDINALS}" echo "kokkoskernels offsets: ${KOKKOSKERNELS_OFFSETS}" echo "kokkoskernels layouts: 
${KOKKOSKERNELS_LAYOUTS}" + echo "kokkoskernels tpls list: ${tpl_list_print}" # KOKKOS_OPTIONS and KOKKOS_CUDA_OPTIONS are exported and detected by kokkos' generate_makefile.sh during install of kokkos; we pass them to the reproducer script instructions echo " # Use generate_makefile line below to call cmake which generates makefile for this build:" &> call_generate_makefile.sh - echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &>> call_generate_makefile.sh + echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} --cmake-flags=${PASSTHRU_CMAKE_FLAGS} 
--kokkos-cmake-flags=${KOKKOS_PASSTHRU_CMAKE_FLAGS} $extra_args" &>> call_generate_makefile.sh chmod +x call_generate_makefile.sh # script command with generic path for faster copy/paste of reproducer into issues - echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &> call_generate_makefile_genericpath.sh + echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} --cmake-flags=${PASSTHRU_CMAKE_FLAGS} --kokkos-cmake-flags=${KOKKOS_PASSTHRU_CMAKE_FLAGS} $extra_args" &> call_generate_makefile_genericpath.sh - run_cmd 
${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} --cmake-flags=${PASSTHRU_CMAKE_FLAGS} --kokkos-cmake-flags=${KOKKOS_PASSTHRU_CMAKE_FLAGS} $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local -i build_start_time=$(date +%s) run_cmd make -j $MAKE_PAR_LEVEL all >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } @@ -1421,9 +1395,15 @@ wait_summarize_and_exit() { rv=$rv+1 local str=$failed_test - local comp=$(echo "$str" | cut -d- -f1) - local vers=$(echo "$str" | cut -d- 
-f2) - local lbuild=$(echo "$str" | cut -d- -f3-) + # Note: all relevant info in str to assemble the build directory path + # is separated by dashes; however the compiler name may include dashes as well + # the final two pieces of str always the version and build-type (as set in BUILD_AND_TEST_DIR) + # leaving the compiler name as the remaining fields preceding version + local getdashes="${str//[^-]}" + local numdashes=${#getdashes} + local lbuild=$(echo "$str" | cut -d- -f${numdashes}-) + local vers=$(echo "$str" | cut -d- -f$((numdashes-1))) + local comp=$(echo "$str" | cut -d- -f-$((numdashes-2))) # Generate reproducer instructions #local filename=reproducer_instructions-$comp-$vers-$lbuild local faildir=$ROOT_DIR/$comp/$vers/$lbuild diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_bsrmatrix_eti_spec_inst.cpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_bsrmatrix_eti_spec_inst.cpp.in index 9895083764a5..077150f36c75 100644 --- a/packages/kokkos-kernels/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_bsrmatrix_eti_spec_inst.cpp.in +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_bsrmatrix_eti_spec_inst.cpp.in @@ -19,11 +19,9 @@ #include "KokkosSparse_spmv_bsrmatrix_spec.hpp" namespace KokkosSparse { -namespace Experimental { namespace Impl { // clang-format off @SPARSE_SPMV_BSRMATRIX_ETI_INST_BLOCK@ // clang-format on } // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse \ No newline at end of file +} // namespace KokkosSparse diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in index d089eca0e3fb..2c9a6083bfb9 100644 --- 
a/packages/kokkos-kernels/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in @@ -19,11 +19,9 @@ #include "KokkosSparse_spmv_bsrmatrix_spec.hpp" namespace KokkosSparse { -namespace Experimental { namespace Impl { // clang-format off @SPARSE_SPMV_MV_BSRMATRIX_ETI_INST_BLOCK@ /// // clang-format on } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in index f98e60ae0dc2..278b60a8131d 100644 --- a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in @@ -17,12 +17,10 @@ #ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ #define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ namespace KokkosSparse { -namespace Experimental { namespace Impl { // clang-format off @SPARSE_SPMV_BSRMATRIX_ETI_AVAIL_BLOCK@ // clang-format on } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif diff --git a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in index df5392826667..3247985f4c5a 100644 --- a/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in +++ b/packages/kokkos-kernels/sparse/eti/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in @@ -18,12 +18,10 @@ 
#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ namespace KokkosSparse { -namespace Experimental { namespace Impl { // clang-format off @SPARSE_SPMV_MV_BSRMATRIX_ETI_AVAIL_BLOCK@ // clang-format on } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_coo2crs_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_coo2crs_impl.hpp index d00a6f34a9b5..aaa5cdcb72f2 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_coo2crs_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_coo2crs_impl.hpp @@ -15,11 +15,6 @@ //@HEADER #ifndef KOKKOSSPARSE_COO2CRS_IMPL_HPP #define KOKKOSSPARSE_COO2CRS_IMPL_HPP -// The unorderedmap changes necessary for this to work -// have not made it into Kokkos 4.0.00 pr 4.0.01 will -// need to see if it happens in 4.1.00 to have a final -// version check here. -#if KOKKOS_VERSION >= 40099 #include #include "Kokkos_UnorderedMap.hpp" @@ -196,8 +191,13 @@ class Coo2Crs { reinterpret_cast(Kokkos::kokkos_malloc( "m_umaps", m_nrows * sizeof(UmapType))); - using shallow_copy_to_device = - Kokkos::Impl::DeepCopy; + auto shallow_copy_to_device = [](UmapType *dst, UmapType const *src, + std::size_t cnt) { + std::size_t nn = cnt / sizeof(UmapType); + Kokkos::deep_copy( + Kokkos::View(dst, nn), + Kokkos::View(src, nn)); + }; UmapType **umap_ptrs = new UmapType *[m_nrows]; // TODO: use host-level parallel_for with tag rowmapRp1 @@ -275,6 +275,4 @@ class Coo2Crs { } // namespace Impl } // namespace KokkosSparse -#endif // KOKKOS_VERSION >= 40099 - #endif // KOKKOSSPARSE_COO2CRS_IMPL_HPP diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 7391e00e3db8..f0b78408bc56 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp 
@@ -1547,8 +1547,8 @@ class PointGaussSeidel { Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector< - MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + KokkosKernels::Impl::zero_vector( my_exec_space, num_cols * block_size, Permuted_Xvector); } else { KokkosKernels::Impl::permute_block_vector< @@ -1664,8 +1664,8 @@ class PointGaussSeidel { Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector< - MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + KokkosKernels::Impl::zero_vector( my_exec_space, num_cols, Permuted_Xvector); } else { KokkosKernels::Impl::permute_vector< diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_impl.hpp index 8c7231f90c92..f616bfe8f369 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_impl.hpp @@ -70,7 +70,7 @@ struct GmresWrap { Kokkos::Profiling::pushRegion("GMRES::TotalTime:"); // Store solver options: - const auto n = A.numRows(); + const auto n = A.numPointRows(); const int m = thandle.get_m(); const auto maxRestart = thandle.get_max_restart(); const auto tol = thandle.get_tol(); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_spec.hpp index bfe1c4539a90..a588793ff872 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_gmres_spec.hpp @@ -23,6 +23,7 @@ #include #include #include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_BsrMatrix.hpp" #include "KokkosKernels_Handle.hpp" // Include the actual functors @@ -81,10 +82,15 @@ template ::value> struct GMRES { - using AMatrix = CrsMatrix; + using AMatrix = CrsMatrix; + using BAMatrix = KokkosSparse::Experimental::BsrMatrix; static void gmres( KernelHandle *handle, const 
AMatrix &A, const BType &B, XType &X, KokkosSparse::Experimental::Preconditioner *precond = nullptr); + + static void gmres( + KernelHandle *handle, const BAMatrix &A, const BType &B, XType &X, + KokkosSparse::Experimental::Preconditioner *precond = nullptr); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -104,6 +110,17 @@ struct GMRES; + static void gmres( + KernelHandle *handle, const BAMatrix &A, const BType &B, XType &X, + KokkosSparse::Experimental::Preconditioner *precond = nullptr) { + auto gmres_handle = handle->get_gmres_handle(); + using Gmres = Experimental::GmresWrap< + typename std::remove_pointer::type>; + + Gmres::gmres(*gmres_handle, A, B, X, precond); + } }; #endif diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 0ac9c2616601..6bdf0eb5775c 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -588,7 +588,7 @@ struct IlutWrap { count); Kokkos::single(Kokkos::PerTeam(team), - [=]() { O_row_map(row_idx) = count; }); + [&]() { O_row_map(row_idx) = count; }); } float_t threshold; @@ -699,18 +699,24 @@ struct IlutWrap { multiply_matrices(kh, ih, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, LU_row_map, LU_entries, LU_values); - auto addHandle = kh.get_spadd_handle(); - KokkosSparse::Experimental::spadd_symbolic( - &kh, A_row_map, A_entries, LU_row_map, LU_entries, R_row_map); + auto addHandle = kh.get_spadd_handle(); + typename KHandle::const_nnz_lno_t m = A_row_map.extent(0) - 1, + n = m; // square matrix + // TODO: let compute_residual_norm also take an execution space argument and + // use that for exec! 
+ typename KHandle::HandleExecSpace exec{}; + KokkosSparse::Experimental::spadd_symbolic(exec, &kh, m, n, A_row_map, + A_entries, LU_row_map, + LU_entries, R_row_map); const size_type r_nnz = addHandle->get_c_nnz(); - Kokkos::resize(R_entries, r_nnz); - Kokkos::resize(R_values, r_nnz); + Kokkos::resize(exec, R_entries, r_nnz); + Kokkos::resize(exec, R_values, r_nnz); KokkosSparse::Experimental::spadd_numeric( - &kh, A_row_map, A_entries, A_values, 1., LU_row_map, LU_entries, - LU_values, -1., R_row_map, R_entries, R_values); - + exec, &kh, m, n, A_row_map, A_entries, A_values, 1., LU_row_map, + LU_entries, LU_values, -1., R_row_map, R_entries, R_values); + // TODO: how to make this policy use exec? auto policy = ih.get_default_team_policy(); Kokkos::parallel_reduce( diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp index 8e70cd3c3bc1..fa356dc96377 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp @@ -174,24 +174,23 @@ struct UnsortedNumericSumFunctor { std::is_same::type, \ typename std::remove_const::type>::value -template +template < + typename execution_space, typename KernelHandle, typename alno_row_view_t, + typename alno_nnz_view_t, typename ascalar_t, typename ascalar_nnz_view_t, + typename blno_row_view_t, typename blno_nnz_view_t, typename bscalar_t, + typename bscalar_nnz_view_t, typename clno_row_view_t, + typename clno_nnz_view_t, typename cscalar_nnz_view_t> void spadd_numeric_impl( - KernelHandle* kernel_handle, const alno_row_view_t a_rowmap, - const alno_nnz_view_t a_entries, const ascalar_nnz_view_t a_values, - const ascalar_t alpha, const blno_row_view_t b_rowmap, - const blno_nnz_view_t b_entries, const bscalar_nnz_view_t b_values, - const bscalar_t beta, const clno_row_view_t c_rowmap, - clno_nnz_view_t c_entries, 
cscalar_nnz_view_t c_values) { + const execution_space& exec, KernelHandle* kernel_handle, + const alno_row_view_t a_rowmap, const alno_nnz_view_t a_entries, + const ascalar_nnz_view_t a_values, const ascalar_t alpha, + const blno_row_view_t b_rowmap, const blno_nnz_view_t b_entries, + const bscalar_nnz_view_t b_values, const bscalar_t beta, + const clno_row_view_t c_rowmap, clno_nnz_view_t c_entries, + cscalar_nnz_view_t c_values) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; - typedef - typename KernelHandle::SPADDHandleType::execution_space execution_space; // Check that A/B/C data types match KernelHandle types, and that C data types // are nonconst (doesn't matter if A/B types are const) static_assert(SAME_TYPE(ascalar_t, scalar_type), @@ -252,7 +251,7 @@ void spadd_numeric_impl( sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, c_entries, a_values, b_values, c_values, alpha, beta); Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted", - range_type(0, nrows), sortedNumeric); + range_type(exec, 0, nrows), sortedNumeric); } else { // use a_pos and b_pos (set in the handle by symbolic) to quickly compute C // entries and values @@ -265,7 +264,7 @@ void spadd_numeric_impl( c_entries, a_values, b_values, c_values, alpha, beta, addHandle->get_a_pos(), addHandle->get_b_pos()); Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted", - range_type(0, nrows), unsortedNumeric); + range_type(exec, 0, nrows), unsortedNumeric); } addHandle->set_call_numeric(); } diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp index e81649f55279..18731348de47 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ 
-28,10 +28,10 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spadd_numeric_eti_spec_avail { enum : bool { value = false }; }; @@ -44,6 +44,7 @@ struct spadd_numeric_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct spadd_numeric_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -87,20 +88,22 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) -template ::value, + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + a_scalar_view_t, b_size_view_t, b_lno_view_t, b_scalar_view_t, + c_size_view_t, c_lno_view_t, c_scalar_view_t>::value, bool eti_spec_avail = spadd_numeric_eti_spec_avail< - KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t, - b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t, - c_lno_view_t, c_scalar_view_t>::value> + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + a_scalar_view_t, b_size_view_t, b_lno_view_t, b_scalar_view_t, + c_size_view_t, c_lno_view_t, c_scalar_view_t>::value> struct SPADD_NUMERIC { - static void spadd_numeric(KernelHandle *handle, + static void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, typename a_scalar_view_t::const_value_type alpha, a_size_view_t row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA, @@ -112,15 +115,17 @@ struct SPADD_NUMERIC { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct SPADD_NUMERIC { - static void spadd_numeric(KernelHandle *handle, +template +struct SPADD_NUMERIC< + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t, + b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t, 
c_lno_view_t, + c_scalar_view_t, false, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> { + static void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t /* m */, + typename KernelHandle::const_nnz_lno_t /* n */, typename a_scalar_view_t::const_value_type alpha, a_size_view_t row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA, @@ -128,8 +133,9 @@ struct SPADD_NUMERIC, \ @@ -178,6 +185,7 @@ struct SPADD_NUMERIC, \ @@ -210,6 +218,6 @@ struct SPADD_NUMERIC >, \ false, true>; -#include +#include #endif diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 15132f9da315..80506e30568d 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -371,50 +371,48 @@ struct MergeEntriesFunctor { }; // Run SortedCountEntries: non-GPU, always uses the RangePolicy version. 
-template +template void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = + const execution_space& exec, const alno_row_view_t_& a_rowmap, + const alno_nnz_view_t_& a_entries, const blno_row_view_t_& b_rowmap, + const blno_nnz_view_t_& b_entries, const clno_row_view_t_& c_rowmap, + typename std::enable_if< + !KokkosKernels::Impl::kk_is_gpu_exec_space()>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - using execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using range_type = Kokkos::RangePolicy; - auto nrows = c_rowmap.extent(0) - 1; + using range_type = Kokkos::RangePolicy; + auto nrows = c_rowmap.extent(0) - 1; SortedCountEntriesRange countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); Kokkos::parallel_for( "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - range_type(0, nrows), countEntries); + range_type(exec, 0, nrows), countEntries); } // Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending // on average nz per row (a runtime decision) -template +template void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = + const execution_space& exec, const alno_row_view_t_& a_rowmap, + const alno_nnz_view_t_& a_entries, const blno_row_view_t_& b_rowmap, + const blno_nnz_view_t_& b_entries, const clno_row_view_t_& c_rowmap, + typename std::enable_if< + KokkosKernels::Impl::kk_is_gpu_exec_space()>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - using 
execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using RangePol = Kokkos::RangePolicy; - using TeamPol = Kokkos::TeamPolicy; - auto nrows = c_rowmap.extent(0) - 1; + using RangePol = Kokkos::RangePolicy; + using TeamPol = Kokkos::TeamPolicy; + auto nrows = c_rowmap.extent(0) - 1; size_type c_est_nnz = 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows; if (c_est_nnz <= 512) { @@ -435,14 +433,14 @@ void runSortedCountEntries( countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); countEntries.sharedPerThread = pot_est_nnz; // compute largest possible team size - TeamPol testPolicy(1, 1, vector_length); + TeamPol testPolicy(exec, 1, 1, vector_length); testPolicy.set_scratch_size( 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); int team_size = testPolicy.team_size_recommended(countEntries, Kokkos::ParallelForTag()); // construct real policy int league_size = (nrows + team_size - 1) / team_size; - TeamPol policy(league_size, team_size, vector_length); + TeamPol policy(exec, league_size, team_size, vector_length); policy.set_scratch_size( 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); countEntries.totalShared = @@ -457,24 +455,23 @@ void runSortedCountEntries( countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); Kokkos::parallel_for( "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - RangePol(0, nrows), countEntries); + RangePol(exec, 0, nrows), countEntries); } } // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. 
-template +template void spadd_symbolic_impl( - KernelHandle* handle, const alno_row_view_t_ a_rowmap, - const alno_nnz_view_t_ a_entries, const blno_row_view_t_ b_rowmap, - const blno_nnz_view_t_ b_entries, + const execution_space& exec, KernelHandle* handle, + const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, + const blno_row_view_t_ b_rowmap, const blno_nnz_view_t_ b_entries, clno_row_view_t_ c_rowmap) // c_rowmap must already be allocated (doesn't // need to be initialized) { - typedef - typename KernelHandle::SPADDHandleType::execution_space execution_space; typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::SPADDHandleType::nnz_lno_view_t ordinal_view_t; @@ -520,17 +517,18 @@ void spadd_symbolic_impl( ordinal_type nrows = a_rowmap.extent(0) - 1; typedef Kokkos::RangePolicy range_type; if (addHandle->is_input_sorted()) { - runSortedCountEntries( - a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + runSortedCountEntries(exec, a_rowmap, a_entries, b_rowmap, + b_entries, c_rowmap); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap); + exec, nrows + 1, c_rowmap); } else { // note: scoping individual parts of the process to free views sooner, // minimizing peak memory usage run the unsorted c_rowmap upper bound // functor (just adds together A and B entry counts row by row) offset_view_t c_rowmap_upperbound( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "C row counts upper bound"), nrows + 1); size_type c_nnz_upperbound = 0; @@ -540,17 +538,17 @@ void spadd_symbolic_impl( countEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound); Kokkos::parallel_for( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", - range_type(0, nrows), countEntries); + range_type(exec, 0, nrows), countEntries); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, 
c_rowmap_upperbound); - Kokkos::deep_copy(c_nnz_upperbound, + exec, nrows + 1, c_rowmap_upperbound); + Kokkos::deep_copy(exec, c_nnz_upperbound, Kokkos::subview(c_rowmap_upperbound, nrows)); } ordinal_view_t c_entries_uncompressed( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "C entries uncompressed"), c_nnz_upperbound); - ordinal_view_t ab_perm(Kokkos::view_alloc(Kokkos::WithoutInitializing, + ordinal_view_t ab_perm(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "A and B permuted entry indices"), c_nnz_upperbound); // compute the unmerged sum @@ -561,17 +559,17 @@ void spadd_symbolic_impl( c_rowmap_upperbound, c_entries_uncompressed, ab_perm); Kokkos::parallel_for( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", - range_type(0, nrows), unmergedSum); + range_type(exec, 0, nrows), unmergedSum); // sort the unmerged sum KokkosSparse::sort_crs_matrix( - c_rowmap_upperbound, c_entries_uncompressed, ab_perm); - ordinal_view_t a_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"), - a_entries.extent(0)); - ordinal_view_t b_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "B entry positions"), - b_entries.extent(0)); + exec, c_rowmap_upperbound, c_entries_uncompressed, ab_perm); + ordinal_view_t a_pos(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "A entry positions"), + a_entries.extent(0)); + ordinal_view_t b_pos(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "B entry positions"), + b_entries.extent(0)); // merge the entries and compute Apos/Bpos, as well as Crowcounts { MergeEntriesFunctor( - nrows + 1, c_rowmap); + exec, nrows + 1, c_rowmap); } addHandle->set_a_b_pos(a_pos, b_pos); } // provide the number of NNZ in C to user through handle size_type cmax; - Kokkos::deep_copy(cmax, Kokkos::subview(c_rowmap, nrows)); + Kokkos::deep_copy(exec, cmax, Kokkos::subview(c_rowmap, nrows)); addHandle->set_c_nnz(cmax); addHandle->set_call_symbolic(); 
addHandle->set_call_numeric(false); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp index aaab68568a3d..bdc4ed04bd94 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -28,8 +28,9 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spadd_symbolic_eti_spec_avail { enum : bool { value = false }; }; @@ -42,6 +43,7 @@ struct spadd_symbolic_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct spadd_symbolic_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -73,31 +75,39 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) -template ::value, + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + b_size_view_t, b_lno_view_t, c_size_view_t>::value, bool eti_spec_avail = spadd_symbolic_eti_spec_avail< - KernelHandle, a_size_view_t, a_lno_view_t, b_size_view_t, - b_lno_view_t, c_size_view_t>::value> + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + b_size_view_t, b_lno_view_t, c_size_view_t>::value> struct SPADD_SYMBOLIC { - static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, - a_lno_view_t entriesA, b_size_view_t row_mapB, - b_lno_view_t entriesB, c_size_view_t row_mapC); + static void spadd_symbolic(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, + a_size_view_t row_mapA, a_lno_view_t entriesA, + b_size_view_t row_mapB, b_lno_view_t entriesB, + c_size_view_t row_mapC); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || 
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct SPADD_SYMBOLIC +struct SPADD_SYMBOLIC { - static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, - a_lno_view_t entriesA, b_size_view_t row_mapB, - b_lno_view_t entriesB, c_size_view_t row_mapC) { - spadd_symbolic_impl(handle, row_mapA, entriesA, row_mapB, entriesB, + static void spadd_symbolic(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t /* m */, + typename KernelHandle::const_nnz_lno_t /* n */, + a_size_view_t row_mapA, a_lno_view_t entriesA, + b_size_view_t row_mapB, b_lno_view_t entriesB, + c_size_view_t row_mapC) { + spadd_symbolic_impl(exec, handle, row_mapA, entriesA, row_mapB, entriesB, row_mapC); } }; @@ -111,6 +121,7 @@ struct SPADD_SYMBOLIC, \ @@ -135,6 +146,7 @@ struct SPADD_SYMBOLIC, \ @@ -155,6 +167,6 @@ struct SPADD_SYMBOLIC >, \ false, true>; -#include +#include #endif diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index c2863885b21a..b3b5dfa27761 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -21,8 +21,17 @@ /// \brief Implementation(s) of the numeric phase of sparse ILU(k). 
#include +#include #include #include +#include "KokkosBatched_SetIdentity_Decl.hpp" +#include "KokkosBatched_SetIdentity_Impl.hpp" +#include "KokkosBatched_Trsm_Decl.hpp" +#include "KokkosBatched_Trsm_Serial_Impl.hpp" +#include "KokkosBatched_Axpy.hpp" +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_Serial_Impl.hpp" +#include "KokkosBlas1_set.hpp" //#define NUMERIC_OUTPUT_INFO @@ -30,391 +39,514 @@ namespace KokkosSparse { namespace Impl { namespace Experimental { -// struct UnsortedTag {}; - -template -struct ILUKLvlSchedRPNumericFunctor { - using lno_t = typename AEntriesType::non_const_value_type; - using scalar_t = typename AValuesType::non_const_value_type; - ARowMapType A_row_map; - AEntriesType A_entries; - AValuesType A_values; - LRowMapType L_row_map; - LEntriesType L_entries; - LValuesType L_values; - URowMapType U_row_map; - UEntriesType U_entries; - UValuesType U_values; - LevelViewType level_idx; - WorkViewType iw; - nnz_lno_t lev_start; - - ILUKLvlSchedRPNumericFunctor( - const ARowMapType &A_row_map_, const AEntriesType &A_entries_, - const AValuesType &A_values_, const LRowMapType &L_row_map_, - const LEntriesType &L_entries_, LValuesType &L_values_, - const URowMapType &U_row_map_, const UEntriesType &U_entries_, - UValuesType &U_values_, const LevelViewType &level_idx_, - WorkViewType &iw_, const nnz_lno_t &lev_start_) - : A_row_map(A_row_map_), - A_entries(A_entries_), - A_values(A_values_), - L_row_map(L_row_map_), - L_entries(L_entries_), - L_values(L_values_), - U_row_map(U_row_map_), - U_entries(U_entries_), - U_values(U_values_), - level_idx(level_idx_), - iw(iw_), - lev_start(lev_start_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - auto rowid = level_idx(i); - auto tid = i - lev_start; - auto k1 = L_row_map(rowid); - auto k2 = L_row_map(rowid + 1); -#ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) { -#else - for (auto k = k1; k < k2; ++k) { -#endif - auto col = L_entries(k); - 
L_values(k) = 0.0; - iw(tid, col) = k; +template +struct IlukWrap { + // + // Useful types + // + using execution_space = typename IlukHandle::execution_space; + using memory_space = typename IlukHandle::memory_space; + using lno_t = typename IlukHandle::nnz_lno_t; + using size_type = typename IlukHandle::size_type; + using scalar_t = typename IlukHandle::nnz_scalar_t; + using HandleDeviceRowMapType = typename IlukHandle::nnz_row_view_t; + using HandleDeviceValueType = typename IlukHandle::nnz_value_view_t; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; + using LevelViewType = typename IlukHandle::nnz_lno_view_t; + using karith = typename Kokkos::ArithTraits; + using team_policy = typename IlukHandle::TeamPolicy; + using member_type = typename team_policy::member_type; + using range_policy = typename IlukHandle::RangePolicy; + + static team_policy get_team_policy(const size_type nrows, + const int team_size) { + team_policy rv; + if (team_size == -1) { + rv = team_policy(nrows, Kokkos::AUTO); + } else { + rv = team_policy(nrows, team_size); } -#ifdef KEEP_DIAG - L_values(k2 - 1) = scalar_t(1.0); -#endif - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) { - auto col = U_entries(k); - U_values(k) = 0.0; - iw(tid, col) = k; + return rv; + } + + static team_policy get_team_policy(execution_space exe_space, + const size_type nrows, + const int team_size) { + team_policy rv; + if (team_size == -1) { + rv = team_policy(exe_space, nrows, Kokkos::AUTO); + } else { + rv = team_policy(exe_space, nrows, team_size); } - // Unpack the ith row of A - k1 = A_row_map(rowid); - k2 = A_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) { - auto col = A_entries(k); - auto ipos = iw(tid, col); - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); + return rv; + } + + /** + * Common base class for SPILUK functors. 
Default version does not support + * blocks + */ + template + struct Common { + ARowMapType A_row_map; + AEntriesType A_entries; + AValuesType A_values; + LRowMapType L_row_map; + LEntriesType L_entries; + LValuesType L_values; + URowMapType U_row_map; + UEntriesType U_entries; + UValuesType U_values; + LevelViewType level_idx; + WorkViewType iw; + lno_t lev_start; + + using reftype = scalar_t &; + + Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, + const AValuesType &A_values_, const LRowMapType &L_row_map_, + const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, + UValuesType &U_values_, const LevelViewType &level_idx_, + WorkViewType &iw_, const lno_t &lev_start_, + const size_type &block_size_) + : A_row_map(A_row_map_), + A_entries(A_entries_), + A_values(A_values_), + L_row_map(L_row_map_), + L_entries(L_entries_), + L_values(L_values_), + U_row_map(U_row_map_), + U_entries(U_entries_), + U_values(U_values_), + level_idx(level_idx_), + iw(iw_), + lev_start(lev_start_) { + KK_REQUIRE_MSG(block_size_ == 0, + "Tried to use blocks with the unblocked Common?"); } - // Eliminate prev rows - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); -#ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) { -#else - for (auto k = k1; k < k2; ++k) { -#endif - auto prev_row = L_entries(k); -#ifdef KEEP_DIAG - auto fact = L_values(k) / U_values(U_row_map(prev_row)); -#else - auto fact = L_values(k) * U_values(U_row_map(prev_row)); -#endif - L_values(k) = fact; - for (auto kk = U_row_map(prev_row) + 1; kk < U_row_map(prev_row + 1); - ++kk) { - auto col = U_entries(kk); - auto ipos = iw(tid, col); - if (ipos == -1) continue; - auto lxu = -U_values(kk) * fact; - if (col < rowid) - L_values(ipos) += lxu; - else - U_values(ipos) += lxu; - } // end for kk - } // end for k - -#ifdef KEEP_DIAG - if (U_values(iw(tid, rowid)) == 0.0) { - U_values(iw(tid, rowid)) = 1e6; + // lset + 
KOKKOS_INLINE_FUNCTION + void lset(const size_type nnz, const scalar_t &value) const { + L_values(nnz) = value; } -#else - if (U_values(iw(tid, rowid)) == 0.0) { - U_values(iw(tid, rowid)) = 1e6; - } else { - U_values(iw(tid, rowid)) = 1.0 / U_values(iw(tid, rowid)); + + // uset + KOKKOS_INLINE_FUNCTION + void uset(const size_type nnz, const scalar_t &value) const { + U_values(nnz) = value; } -#endif - // Reset - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); -#ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) -#else - for (auto k = k1; k < k2; ++k) -#endif - iw(tid, L_entries(k)) = -1; + // lset_id + KOKKOS_INLINE_FUNCTION + void lset_id(const member_type &team, const size_type nnz) const { + // Not sure a Kokkos::single is really needed here since the + // race is harmless + Kokkos::single(Kokkos::PerTeam(team), + [&]() { L_values(nnz) = scalar_t(1.0); }); + } - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); - for (auto k = k1; k < k2; ++k) iw(tid, U_entries(k)) = -1; - } -}; - -template -struct ILUKLvlSchedTP1NumericFunctor { - using execution_space = typename ARowMapType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - using size_type = typename ARowMapType::non_const_value_type; - using lno_t = typename AEntriesType::non_const_value_type; - using scalar_t = typename AValuesType::non_const_value_type; - - ARowMapType A_row_map; - AEntriesType A_entries; - AValuesType A_values; - LRowMapType L_row_map; - LEntriesType L_entries; - LValuesType L_values; - URowMapType U_row_map; - UEntriesType U_entries; - UValuesType U_values; - LevelViewType level_idx; - WorkViewType iw; - nnz_lno_t lev_start; - - ILUKLvlSchedTP1NumericFunctor( - const ARowMapType &A_row_map_, const AEntriesType &A_entries_, - const AValuesType &A_values_, const LRowMapType &L_row_map_, - const LEntriesType &L_entries_, LValuesType &L_values_, - const URowMapType &U_row_map_, const UEntriesType &U_entries_, - 
UValuesType &U_values_, const LevelViewType &level_idx_, - WorkViewType &iw_, const nnz_lno_t &lev_start_) - : A_row_map(A_row_map_), - A_entries(A_entries_), - A_values(A_values_), - L_row_map(L_row_map_), - L_entries(L_entries_), - L_values(L_values_), - U_row_map(U_row_map_), - U_entries(U_entries_), - U_values(U_values_), - level_idx(level_idx_), - iw(iw_), - lev_start(lev_start_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - nnz_lno_t my_team = static_cast(team.league_rank()); - nnz_lno_t rowid = - static_cast(level_idx(my_team + lev_start)); // map to rowid - - size_type k1 = static_cast(L_row_map(rowid)); - size_type k2 = static_cast(L_row_map(rowid + 1)); -#ifdef KEEP_DIAG - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = static_cast(k); - }); -#else - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = static_cast(k); - }); -#endif + // divide. lhs /= rhs + KOKKOS_INLINE_FUNCTION + void divide(const member_type &team, scalar_t &lhs, + const scalar_t &rhs) const { + Kokkos::single(Kokkos::PerTeam(team), [&]() { lhs /= rhs; }); + team.team_barrier(); + } -#ifdef KEEP_DIAG - // if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); - Kokkos::single(Kokkos::PerTeam(team), - [&]() { L_values(k2 - 1) = scalar_t(1.0); }); -#endif + // multiply_subtract. 
C -= A * B + KOKKOS_INLINE_FUNCTION + void multiply_subtract(const scalar_t &A, const scalar_t &B, + scalar_t &C) const { + C -= A * B; + } - team.team_barrier(); - - k1 = static_cast(U_row_map(rowid)); - k2 = static_cast(U_row_map(rowid + 1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(U_entries(k)); - U_values(k) = 0.0; - iw(my_team, col) = static_cast(k); - }); - - team.team_barrier(); - - // Unpack the ith row of A - k1 = static_cast(A_row_map(rowid)); - k2 = static_cast(A_row_map(rowid + 1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(A_entries(k)); - nnz_lno_t ipos = iw(my_team, col); - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); - }); - - team.team_barrier(); - - // Eliminate prev rows - k1 = static_cast(L_row_map(rowid)); - k2 = static_cast(L_row_map(rowid + 1)); -#ifdef KEEP_DIAG - for (size_type k = k1; k < k2 - 1; k++) -#else - for (size_type k = k1; k < k2; k++) -#endif - { - nnz_lno_t prev_row = L_entries(k); - - scalar_t fact = scalar_t(0.0); - Kokkos::single( - Kokkos::PerTeam(team), - [&](scalar_t &tmp_fact) { -#ifdef KEEP_DIAG - tmp_fact = L_values(k) / U_values(U_row_map(prev_row)); -#else - tmp_fact = L_values(k) * U_values(U_row_map(prev_row)); -#endif - L_values(k) = tmp_fact; - }, - fact); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, - U_row_map(prev_row + 1)), - [&](const size_type kk) { - nnz_lno_t col = static_cast(U_entries(kk)); - nnz_lno_t ipos = iw(my_team, col); - auto lxu = -U_values(kk) * fact; - if (ipos != -1) { - if (col < rowid) - L_values(ipos) += lxu; - else - U_values(ipos) += lxu; - } - }); // end for kk + // lget + KOKKOS_INLINE_FUNCTION + scalar_t &lget(const size_type nnz) const { return L_values(nnz); } - team.team_barrier(); - } // end for k - - // if (my_thread == 0) { - 
Kokkos::single(Kokkos::PerTeam(team), [&]() { - nnz_lno_t ipos = iw(my_team, rowid); -#ifdef KEEP_DIAG - if (U_values(ipos) == 0.0) { - U_values(ipos) = 1e6; + // uget + KOKKOS_INLINE_FUNCTION + scalar_t &uget(const size_type nnz) const { return U_values(nnz); } + + // aget + KOKKOS_INLINE_FUNCTION + scalar_t aget(const size_type nnz) const { return A_values(nnz); } + + // uequal + KOKKOS_INLINE_FUNCTION + bool uequal(const size_type nnz, const scalar_t &value) const { + return U_values(nnz) == value; + } + + // print + KOKKOS_INLINE_FUNCTION + void print(const scalar_t &item) const { std::cout << item << std::endl; } + }; + + // Partial specialization for block support + template + struct Common { + ARowMapType A_row_map; + AEntriesType A_entries; + AValuesType A_values; + LRowMapType L_row_map; + LEntriesType L_entries; + LValuesType L_values; + URowMapType U_row_map; + UEntriesType U_entries; + UValuesType U_values; + LevelViewType level_idx; + WorkViewType iw; + lno_t lev_start; + size_type block_size; + size_type block_items; + + // BSR data is in LayoutRight! 
+ using Layout = Kokkos::LayoutRight; + + using LBlock = Kokkos::View< + typename LValuesType::value_type **, Layout, + typename LValuesType::device_type, + Kokkos::MemoryTraits >; + + using UBlock = Kokkos::View< + typename UValuesType::value_type **, Layout, + typename UValuesType::device_type, + Kokkos::MemoryTraits >; + + using ABlock = Kokkos::View< + typename AValuesType::value_type **, Layout, + typename AValuesType::device_type, + Kokkos::MemoryTraits >; + + using reftype = LBlock; + + Common(const ARowMapType &A_row_map_, const AEntriesType &A_entries_, + const AValuesType &A_values_, const LRowMapType &L_row_map_, + const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, + UValuesType &U_values_, const LevelViewType &level_idx_, + WorkViewType &iw_, const lno_t &lev_start_, + const size_type &block_size_) + : A_row_map(A_row_map_), + A_entries(A_entries_), + A_values(A_values_), + L_row_map(L_row_map_), + L_entries(L_entries_), + L_values(L_values_), + U_row_map(U_row_map_), + U_entries(U_entries_), + U_values(U_values_), + level_idx(level_idx_), + iw(iw_), + lev_start(lev_start_), + block_size(block_size_), + block_items(block_size * block_size) { + KK_REQUIRE_MSG(block_size > 0, + "Tried to use block_size=0 with the blocked Common?"); + } + + // lset + KOKKOS_INLINE_FUNCTION + void lset(const size_type block, const scalar_t &value) const { + KokkosBlas::SerialSet::invoke(value, lget(block)); + } + + KOKKOS_INLINE_FUNCTION + void lset(const size_type block, const ABlock &rhs) const { + auto lblock = lget(block); + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + lblock(i, j) = rhs(i, j); + } } -#else - if (U_values(ipos) == 0.0) { - U_values(ipos) = 1e6; - } else { - U_values(ipos) = 1.0 / U_values(ipos); + } + + // uset + KOKKOS_INLINE_FUNCTION + void uset(const size_type block, const scalar_t &value) const { + 
KokkosBlas::SerialSet::invoke(value, uget(block)); + } + + KOKKOS_INLINE_FUNCTION + void uset(const size_type block, const ABlock &rhs) const { + auto ublock = uget(block); + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + ublock(i, j) = rhs(i, j); + } } -#endif - }); - //} - - team.team_barrier(); - - // Reset - k1 = static_cast(L_row_map(rowid)); - k2 = static_cast(L_row_map(rowid + 1)); -#ifdef KEEP_DIAG - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - }); -#else - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - }); -#endif + } + + // lset_id + KOKKOS_INLINE_FUNCTION + void lset_id(const member_type &team, const size_type block) const { + KokkosBatched::TeamSetIdentity::invoke(team, lget(block)); + } + + // divide. lhs /= rhs + KOKKOS_INLINE_FUNCTION + void divide(const member_type &team, const LBlock &lhs, + const UBlock &rhs) const { + KokkosBatched::TeamTrsm< + member_type, KokkosBatched::Side::Right, KokkosBatched::Uplo::Upper, + KokkosBatched::Trans::NoTranspose, // not 100% on this + KokkosBatched::Diag::NonUnit, + KokkosBatched::Algo::Trsm::Unblocked>:: // not 100% on this + invoke(team, 1.0, rhs, lhs); + } + + // multiply_subtract. C -= A * B + template + KOKKOS_INLINE_FUNCTION void multiply_subtract(const UBlock &A, + const LBlock &B, + CView &C) const { + // Use gemm. 
alpha is hardcoded to -1, beta hardcoded to 1 + KokkosBatched::SerialGemm< + KokkosBatched::Trans::NoTranspose, KokkosBatched::Trans::NoTranspose, + KokkosBatched::Algo::Gemm::Unblocked>::invoke( + -1.0, A, B, 1.0, C); + } + + // lget + KOKKOS_INLINE_FUNCTION + LBlock lget(const size_type block) const { + return LBlock(L_values.data() + (block * block_items), block_size, + block_size); + } + + // uget + KOKKOS_INLINE_FUNCTION + UBlock uget(const size_type block) const { + return UBlock(U_values.data() + (block * block_items), block_size, + block_size); + } + + // aget + KOKKOS_INLINE_FUNCTION + ABlock aget(const size_type block) const { + return ABlock(A_values.data() + (block * block_items), block_size, + block_size); + } + + // uequal + KOKKOS_INLINE_FUNCTION + bool uequal(const size_type block, const scalar_t &value) const { + auto u_block = uget(block); + for (size_type i = 0; i < block_size; ++i) { + for (size_type j = 0; j < block_size; ++j) { + if (u_block(i, j) != value) { + return false; + } + } + } + return true; + } - k1 = static_cast(U_row_map(rowid)); - k2 = static_cast(U_row_map(rowid + 1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - nnz_lno_t col = static_cast(U_entries(k)); - iw(my_team, col) = -1; - }); + // print + KOKKOS_INLINE_FUNCTION + void print(const LBlock &item) const { + for (size_type i = 0; i < block_size; ++i) { + std::cout << " "; + for (size_type j = 0; j < block_size; ++j) { + std::cout << item(i, j) << " "; + } + std::cout << std::endl; + } + } + }; + + template + struct ILUKLvlSchedTP1NumericFunctor + : public Common { + using Base = Common; + + ILUKLvlSchedTP1NumericFunctor( + const ARowMapType &A_row_map_, const AEntriesType &A_entries_, + const AValuesType &A_values_, const LRowMapType &L_row_map_, + const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, + UValuesType &U_values_, const LevelViewType &level_idx_, + 
WorkViewType &iw_, const lno_t &lev_start_, + const size_type &block_size_ = 0) + : Base(A_row_map_, A_entries_, A_values_, L_row_map_, L_entries_, + L_values_, U_row_map_, U_entries_, U_values_, level_idx_, iw_, + lev_start_, block_size_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &team) const { + const auto my_team = team.league_rank(); + const auto rowid = + Base::level_idx(my_team + Base::lev_start); // map to rowid + + // Set active entries in L to zero, store active cols in iw + // Set L diagonal for this row to identity + size_type k1 = Base::L_row_map(rowid); + size_type k2 = Base::L_row_map(rowid + 1) - 1; + Base::lset_id(team, k2); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::L_entries(k); + Base::lset(k, 0.0); + Base::iw(my_team, col) = k; + }); + + team.team_barrier(); + + // Set active entries in U to zero, store active cols in iw + k1 = Base::U_row_map(rowid); + k2 = Base::U_row_map(rowid + 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::U_entries(k); + Base::uset(k, 0.0); + Base::iw(my_team, col) = k; + }); + + team.team_barrier(); + + // Unpack the rowid-th row of A, copy into L,U + k1 = Base::A_row_map(rowid); + k2 = Base::A_row_map(rowid + 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::A_entries(k); + const auto ipos = Base::iw(my_team, col); + if (col < rowid) { + Base::lset(ipos, Base::aget(k)); + } else { + Base::uset(ipos, Base::aget(k)); + } + }); + + team.team_barrier(); + + // Eliminate prev rows + k1 = Base::L_row_map(rowid); + k2 = Base::L_row_map(rowid + 1) - 1; + for (auto k = k1; k < k2; k++) { + const auto prev_row = Base::L_entries(k); + const auto udiag = Base::uget(Base::U_row_map(prev_row)); + Base::divide(team, Base::lget(k), udiag); + auto fact = Base::lget(k); + Kokkos::parallel_for( + 
Kokkos::TeamThreadRange(team, Base::U_row_map(prev_row) + 1, + Base::U_row_map(prev_row + 1)), + [&](const size_type kk) { + const auto col = Base::U_entries(kk); + const auto ipos = Base::iw(my_team, col); + if (ipos != -1) { + typename Base::reftype C = + col < rowid ? Base::lget(ipos) : Base::uget(ipos); + Base::multiply_subtract(fact, Base::uget(kk), C); + } + }); // end for kk + + team.team_barrier(); + } // end for k + + Kokkos::single(Kokkos::PerTeam(team), [&]() { + const auto ipos = Base::iw(my_team, rowid); + if (Base::uequal(ipos, 0.0)) { + Base::uset(ipos, 1e6); + } + }); + + team.team_barrier(); + + // Reset + k1 = Base::L_row_map(rowid); + k2 = Base::L_row_map(rowid + 1) - 1; + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::L_entries(k); + Base::iw(my_team, col) = -1; + }); + + k1 = Base::U_row_map(rowid); + k2 = Base::U_row_map(rowid + 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + const auto col = Base::U_entries(k); + Base::iw(my_team, col) = -1; + }); + } + }; + +#define FunctorTypeMacro(Functor, BlockEnabled) \ + Functor + +#define KernelLaunchMacro(arow, aent, aval, lrow, lent, lval, urow, uent, \ + uval, polc, name, lidx, iwv, lstrt, ftf, ftb, be, \ + bs) \ + if (be) { \ + ftb functor(arow, aent, aval, lrow, lent, lval, urow, uent, uval, lidx, \ + iwv, lstrt, bs); \ + Kokkos::parallel_for(name, polc, functor); \ + } else { \ + ftf functor(arow, aent, aval, lrow, lent, lval, urow, uent, uval, lidx, \ + iwv, lstrt); \ + Kokkos::parallel_for(name, polc, functor); \ } -}; - -template -void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, - const AEntriesType &A_entries, const AValuesType &A_values, - const LRowMapType &L_row_map, const LEntriesType &L_entries, - LValuesType &L_values, const URowMapType &U_row_map, - const UEntriesType &U_entries, UValuesType &U_values) { - using execution_space = typename 
IlukHandle::execution_space; - using size_type = typename IlukHandle::size_type; - using nnz_lno_t = typename IlukHandle::nnz_lno_t; - using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - using WorkViewType = typename IlukHandle::work_view_t; - using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; - - size_type nlevels = thandle.get_num_levels(); - int team_size = thandle.get_team_size(); - - LevelHostViewType level_ptr_h = thandle.get_host_level_ptr(); - HandleDeviceEntriesType level_idx = thandle.get_level_idx(); - - LevelHostViewType level_nchunks_h, level_nrowsperchunk_h; - WorkViewType iw; - - //{ - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + + template + static void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, + const AEntriesType &A_entries, + const AValuesType &A_values, + const LRowMapType &L_row_map, + const LEntriesType &L_entries, LValuesType &L_values, + const URowMapType &U_row_map, + const UEntriesType &U_entries, + UValuesType &U_values) { + using TPF = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, false); + using TPB = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, true); + + size_type nlevels = thandle.get_num_levels(); + int team_size = thandle.get_team_size(); + const auto block_size = thandle.get_block_size(); + const auto block_enabled = thandle.is_block_enabled(); + + LevelHostViewType level_ptr_h = thandle.get_host_level_ptr(); + LevelViewType level_idx = thandle.get_level_idx(); + + LevelHostViewType level_nchunks_h, level_nrowsperchunk_h; + WorkViewType iw; + level_nchunks_h = thandle.get_level_nchunks(); level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); - } - iw = thandle.get_iw(); + iw = thandle.get_iw(); - // Main loop must be performed sequential. 
Question: Try out Cuda's graph - // stuff to reduce kernel launch overhead - for (size_type lvl = 0; lvl < nlevels; ++lvl) { - nnz_lno_t lev_start = level_ptr_h(lvl); - nnz_lno_t lev_end = level_ptr_h(lvl + 1); + // Main loop must be performed sequential. Question: Try out Cuda's graph + // stuff to reduce kernel launch overhead + for (size_type lvl = 0; lvl < nlevels; ++lvl) { + lno_t lev_start = level_ptr_h(lvl); + lno_t lev_end = level_ptr_h(lvl + 1); - if ((lev_end - lev_start) != 0) { - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::RangePolicy(lev_start, lev_end), - ILUKLvlSchedRPNumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, - LEntriesType, LValuesType, URowMapType, UEntriesType, - UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t>( - A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, level_idx, iw, lev_start)); - } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - - nnz_lno_t lvl_rowid_start = 0; - nnz_lno_t lvl_nrows_chunk; + if ((lev_end - lev_start) != 0) { + lno_t lvl_rowid_start = 0; + lno_t lvl_nrows_chunk; for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > (lev_end - lev_start)) @@ -422,163 +554,110 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, else lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - ILUKLvlSchedTP1NumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, level_idx, iw, - lev_start + lvl_rowid_start); - - if (team_size == -1) - 
Kokkos::parallel_for( - "parfor_tp1", policy_type(lvl_nrows_chunk, Kokkos::AUTO), tstf); - else - Kokkos::parallel_for("parfor_tp1", - policy_type(lvl_nrows_chunk, team_size), tstf); + team_policy tpolicy = get_team_policy(lvl_nrows_chunk, team_size); + KernelLaunchMacro(A_row_map, A_entries, A_values, L_row_map, + L_entries, L_values, U_row_map, U_entries, U_values, + tpolicy, "parfor_tp1", level_idx, iw, + lev_start + lvl_rowid_start, TPF, TPB, + block_enabled, block_size); Kokkos::fence(); lvl_rowid_start += lvl_nrows_chunk; } - } - } // end if - } // end for lvl - //} + } // end if + } // end for lvl // Output check #ifdef NUMERIC_OUTPUT_INFO - std::cout << " iluk_numeric result: " << std::endl; + std::cout << " iluk_numeric result: " << std::endl; - std::cout << " nnzL: " << thandle.get_nnzL() << std::endl; - std::cout << " L_row_map = "; - for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { - std::cout << L_row_map(i) << " "; - } - std::cout << std::endl; + std::cout << " nnzL: " << thandle.get_nnzL() << std::endl; + std::cout << " L_row_map = "; + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { + std::cout << L_row_map(i) << " "; + } + std::cout << std::endl; - std::cout << " L_entries = "; - for (size_type i = 0; i < thandle.get_nnzL(); ++i) { - std::cout << L_entries(i) << " "; - } - std::cout << std::endl; + std::cout << " L_entries = "; + for (size_type i = 0; i < thandle.get_nnzL(); ++i) { + std::cout << L_entries(i) << " "; + } + std::cout << std::endl; - std::cout << " L_values = "; - for (size_type i = 0; i < thandle.get_nnzL(); ++i) { - std::cout << L_values(i) << " "; - } - std::cout << std::endl; + std::cout << " L_values = "; + for (size_type i = 0; i < thandle.get_nnzL(); ++i) { + std::cout << L_values(i) << " "; + } + std::cout << std::endl; - std::cout << " nnzU: " << thandle.get_nnzU() << std::endl; - std::cout << " U_row_map = "; - for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { - std::cout << U_row_map(i) << " "; - 
} - std::cout << std::endl; + std::cout << " nnzU: " << thandle.get_nnzU() << std::endl; + std::cout << " U_row_map = "; + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { + std::cout << U_row_map(i) << " "; + } + std::cout << std::endl; - std::cout << " U_entries = "; - for (size_type i = 0; i < thandle.get_nnzU(); ++i) { - std::cout << U_entries(i) << " "; - } - std::cout << std::endl; + std::cout << " U_entries = "; + for (size_type i = 0; i < thandle.get_nnzU(); ++i) { + std::cout << U_entries(i) << " "; + } + std::cout << std::endl; - std::cout << " U_values = "; - for (size_type i = 0; i < thandle.get_nnzU(); ++i) { - std::cout << U_values(i) << " "; - } - std::cout << std::endl; + std::cout << " U_values = "; + for (size_type i = 0; i < thandle.get_nnzU(); ++i) { + std::cout << U_values(i) << " "; + } + std::cout << std::endl; #endif -} // end iluk_numeric - -template -void iluk_numeric_streams(const std::vector &execspace_v, - const std::vector &thandle_v, - const std::vector &A_row_map_v, - const std::vector &A_entries_v, - const std::vector &A_values_v, - const std::vector &L_row_map_v, - const std::vector &L_entries_v, - std::vector &L_values_v, - const std::vector &U_row_map_v, - const std::vector &U_entries_v, - std::vector &U_values_v) { - using size_type = typename IlukHandle::size_type; - using nnz_lno_t = typename IlukHandle::nnz_lno_t; - using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - using WorkViewType = typename IlukHandle::work_view_t; - using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; - - // Create vectors for handles' data in streams - int nstreams = execspace_v.size(); - std::vector nlevels_v(nstreams); - std::vector lvl_ptr_h_v(nstreams); - std::vector lvl_idx_v(nstreams); // device views - std::vector lvl_start_v(nstreams); - std::vector lvl_end_v(nstreams); - std::vector iw_v(nstreams); // device views - std::vector stream_have_level_v(nstreams); - - // Retrieve data from handles and find 
max. number of levels among streams - size_type nlevels_max = 0; - for (int i = 0; i < nstreams; i++) { - nlevels_v[i] = thandle_v[i]->get_num_levels(); - lvl_ptr_h_v[i] = thandle_v[i]->get_host_level_ptr(); - lvl_idx_v[i] = thandle_v[i]->get_level_idx(); - iw_v[i] = thandle_v[i]->get_iw(); - stream_have_level_v[i] = true; - if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; - } - - // Assume all streams use the same algorithm - if (thandle_v[0]->get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { - // Main loop must be performed sequential - for (size_type lvl = 0; lvl < nlevels_max; lvl++) { - // Initial work across streams at each level - for (int i = 0; i < nstreams; i++) { - // Only do this if this stream has this level - if (lvl < nlevels_v[i]) { - lvl_start_v[i] = lvl_ptr_h_v[i](lvl); - lvl_end_v[i] = lvl_ptr_h_v[i](lvl + 1); - if ((lvl_end_v[i] - lvl_start_v[i]) != 0) - stream_have_level_v[i] = true; - else - stream_have_level_v[i] = false; - } else - stream_have_level_v[i] = false; - } - - // Main work of the level across streams - // 1. 
Launch work on all streams - for (int i = 0; i < nstreams; i++) { - // Launch only if stream i-th has this level - if (stream_have_level_v[i]) { - ILUKLvlSchedRPNumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], lvl_idx_v[i], - iw_v[i], lvl_start_v[i]); - Kokkos::parallel_for( - "parfor_rp", - Kokkos::RangePolicy(execspace_v[i], - lvl_start_v[i], lvl_end_v[i]), - tstf); - } // end if (stream_have_level_v[i]) - } // end for streams - } // end for lvl - } // end SEQLVLSCHD_RP - else if (thandle_v[0]->get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; + } // end iluk_numeric + + template + static void iluk_numeric_streams( + const std::vector &execspace_v, + const std::vector &thandle_v, + const std::vector &A_row_map_v, + const std::vector &A_entries_v, + const std::vector &A_values_v, + const std::vector &L_row_map_v, + const std::vector &L_entries_v, + std::vector &L_values_v, + const std::vector &U_row_map_v, + const std::vector &U_entries_v, + std::vector &U_values_v) { + using TPF = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, false); + using TPB = FunctorTypeMacro(ILUKLvlSchedTP1NumericFunctor, true); + + // Create vectors for handles' data in streams + int nstreams = execspace_v.size(); + std::vector nlevels_v(nstreams); + std::vector lvl_ptr_h_v(nstreams); + std::vector lvl_idx_v(nstreams); // device views + std::vector lvl_start_v(nstreams); + std::vector lvl_end_v(nstreams); + std::vector iw_v(nstreams); // device views + std::vector stream_have_level_v(nstreams); + std::vector is_block_enabled_v(nstreams); + std::vector block_size_v(nstreams); + + // Retrieve data from handles and find max. 
number of levels among streams + size_type nlevels_max = 0; + for (int i = 0; i < nstreams; i++) { + nlevels_v[i] = thandle_v[i]->get_num_levels(); + lvl_ptr_h_v[i] = thandle_v[i]->get_host_level_ptr(); + lvl_idx_v[i] = thandle_v[i]->get_level_idx(); + iw_v[i] = thandle_v[i]->get_iw(); + is_block_enabled_v[i] = thandle_v[i]->is_block_enabled(); + block_size_v[i] = thandle_v[i]->get_block_size(); + stream_have_level_v[i] = true; + if (nlevels_max < nlevels_v[i]) nlevels_max = nlevels_v[i]; + } std::vector lvl_nchunks_h_v(nstreams); std::vector lvl_nrowsperchunk_h_v(nstreams); - std::vector lvl_rowid_start_v(nstreams); + std::vector lvl_rowid_start_v(nstreams); std::vector team_size_v(nstreams); for (int i = 0; i < nstreams; i++) { @@ -590,7 +669,7 @@ void iluk_numeric_streams(const std::vector &execspace_v, // Main loop must be performed sequential for (size_type lvl = 0; lvl < nlevels_max; lvl++) { // Initial work across streams at each level - nnz_lno_t lvl_nchunks_max = 0; + lno_t lvl_nchunks_max = 0; for (int i = 0; i < nstreams; i++) { // Only do this if this stream has this level if (lvl < nlevels_v[i]) { @@ -616,7 +695,7 @@ void iluk_numeric_streams(const std::vector &execspace_v, // Launch only if stream i-th has this chunk if (chunkid < lvl_nchunks_h_v[i](lvl)) { // 1.a. Specify number of rows (i.e. number of teams) to launch - nnz_lno_t lvl_nrows_chunk = 0; + lno_t lvl_nrows_chunk = 0; if ((lvl_rowid_start_v[i] + lvl_nrowsperchunk_h_v[i](lvl)) > (lvl_end_v[i] - lvl_start_v[i])) lvl_nrows_chunk = @@ -625,27 +704,14 @@ void iluk_numeric_streams(const std::vector &execspace_v, lvl_nrows_chunk = lvl_nrowsperchunk_h_v[i](lvl); // 1.b. 
Create functor for stream i-th and launch - ILUKLvlSchedTP1NumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, - LEntriesType, LValuesType, URowMapType, UEntriesType, - UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map_v[i], A_entries_v[i], A_values_v[i], - L_row_map_v[i], L_entries_v[i], L_values_v[i], - U_row_map_v[i], U_entries_v[i], U_values_v[i], - lvl_idx_v[i], iw_v[i], - lvl_start_v[i] + lvl_rowid_start_v[i]); - if (team_size_v[i] == -1) - Kokkos::parallel_for( - "parfor_tp1", - policy_type(execspace_v[i], lvl_nrows_chunk, Kokkos::AUTO), - tstf); - else - Kokkos::parallel_for( - "parfor_tp1", - policy_type(execspace_v[i], lvl_nrows_chunk, - team_size_v[i]), - tstf); - + team_policy tpolicy = get_team_policy( + execspace_v[i], lvl_nrows_chunk, team_size_v[i]); + KernelLaunchMacro(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + tpolicy, "parfor_tp1", lvl_idx_v[i], iw_v[i], + lvl_start_v[i] + lvl_rowid_start_v[i], TPF, TPB, + is_block_enabled_v[i], block_size_v[i]); // 1.c. 
Ready to move to next chunk lvl_rowid_start_v[i] += lvl_nrows_chunk; } // end if (chunkid < lvl_nchunks_h_v[i](lvl)) @@ -653,12 +719,15 @@ void iluk_numeric_streams(const std::vector &execspace_v, } // end for streams } // end for chunkid } // end for lvl - } // end SEQLVLSCHD_TP1 + } // end iluk_numeric_streams -} // end iluk_numeric_streams +}; // IlukWrap } // namespace Experimental } // namespace Impl } // namespace KokkosSparse +#undef FunctorTypeMacro +#undef KernelLaunchMacro + #endif diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp index 12f8c43cafa8..f58f691e8939 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spiluk_numeric_spec.hpp @@ -145,6 +145,8 @@ struct SPILUK_NUMERIC { + using Iluk = Experimental::IlukWrap; + static void spiluk_numeric( KernelHandle *handle, const typename KernelHandle::const_nnz_lno_t & /*fill_lev*/, @@ -155,9 +157,9 @@ struct SPILUK_NUMERICget_spiluk_handle(); - Experimental::iluk_numeric(*spiluk_handle, A_row_map, A_entries, A_values, - L_row_map, L_entries, L_values, U_row_map, - U_entries, U_values); + Iluk::iluk_numeric(*spiluk_handle, A_row_map, A_entries, A_values, + L_row_map, L_entries, L_values, U_row_map, U_entries, + U_values); } static void spiluk_numeric_streams( @@ -178,10 +180,10 @@ struct SPILUK_NUMERIC static_cast(L_entries_d.extent(0))) { -#else - if (cntL + lenl > static_cast(L_entries_d.extent(0))) { -#endif // size_type newsize = (size_type) (L_entries_d.extent(0)*EXPAND_FACT); // Kokkos::resize(L_entries, newsize); // Kokkos::resize(L_entries_d, newsize); std::ostringstream os; os << "KokkosSparse::Experimental::spiluk_symbolic: L_entries's extent " "must be larger than " - << L_entries_d.extent(0); + << L_entries_d.extent(0) << ", must be at least " << cntL + lenl + 1; 
KokkosKernels::Impl::throw_runtime_exception(os.str()); } for (size_type k = 0; k < lenl; ++k) { L_entries(cntL) = h_iL(k); cntL++; } -#ifdef KEEP_DIAG // L diag entry L_entries(cntL) = i; cntL++; -#endif L_row_map(i + 1) = cntL; } // End main loop i diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 06fe6f094daa..85e27f1b1bef 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -27,7 +27,6 @@ #include namespace KokkosSparse { -namespace Experimental { namespace Impl { struct BsrMatrixSpMVTensorCoreFunctorParams { @@ -519,7 +518,6 @@ struct BsrMatrixSpMVTensorCoreDispatcher { }; } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // #if CUDA && (VOLTA || AMPERE) @@ -537,7 +535,6 @@ struct BsrMatrixSpMVTensorCoreDispatcher { #include "KokkosKernels_ExecSpaceUtils.hpp" namespace KokkosSparse { -namespace Experimental { namespace Impl { namespace Bsr { @@ -677,13 +674,12 @@ struct BSR_GEMV_Functor { // spMatVec_no_transpose: version for CPU execution spaces // (RangePolicy or trivial serial impl used) // -template ()>::type * = nullptr> void spMatVec_no_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -704,15 +700,8 @@ void spMatVec_no_transpose( AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; 
- } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; BSR_GEMV_Functor func( alpha, A, x, beta, y, A.blockDim(), useConjugate); @@ -738,13 +727,12 @@ void spMatVec_no_transpose( // // spMatVec_no_transpose: version for GPU execution spaces (TeamPolicy used) // -template ()>::type * = nullptr> void spMatVec_no_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -758,15 +746,9 @@ void spMatVec_no_transpose( AMatrix_Internal; typedef typename AMatrix_Internal::execution_space execution_space; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; + int team_size = -1; int vector_length = -1; const auto block_dim = A.blockDim(); @@ -788,14 +770,10 @@ void spMatVec_no_transpose( int64_t worksets = A.numRows(); // - // Use the controls to allow the user to pass in some tuning parameters. + // Use the handle to allow the user to pass in some tuning parameters. 
// - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } + if (handle->team_size != -1) team_size = handle->team_size; + if (handle->vector_length != -1) vector_length = handle->vector_length; BSR_GEMV_Functor func( alpha, A, x, beta, y, block_dim, useConjugate); @@ -990,13 +968,12 @@ struct BSR_GEMV_Transpose_Functor { /// \brief spMatVec_transpose: version for CPU execution spaces (RangePolicy or /// trivial serial impl used) -template ()>::type * = nullptr> void spMatVec_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -1019,15 +996,8 @@ void spMatVec_transpose( AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; BSR_GEMV_Transpose_Functor func( alpha, A, x, y, useConjugate); @@ -1051,15 +1021,14 @@ void spMatVec_transpose( // // spMatVec_transpose: version for GPU execution spaces (TeamPolicy used) // -template ()>::type * = nullptr> void spMatVec_transpose(const typename AMatrix::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, - const AlphaType &alpha, const AMatrix &A, - const XVector &x, const BetaType &beta, YVector &y, - 
bool useConjugate) { + Handle *handle, const AlphaType &alpha, + const AMatrix &A, const XVector &x, + const BetaType &beta, YVector &y, bool useConjugate) { if (A.numRows() <= 0) { return; } @@ -1073,17 +1042,10 @@ void spMatVec_transpose(const typename AMatrix::execution_space &exec, else if (beta != Kokkos::ArithTraits::one()) KokkosBlas::scal(exec, y, beta, y); - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } - int team_size = -1; - int vector_length = -1; + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; + int team_size = -1; + int vector_length = -1; int64_t worksets = A.numRows(); @@ -1104,14 +1066,10 @@ void spMatVec_transpose(const typename AMatrix::execution_space &exec, } // - // Use the controls to allow the user to pass in some tuning parameters. + // Use the handle to allow the user to pass in some tuning parameters. 
// - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } + if (handle->team_size != -1) team_size = handle->team_size; + if (handle->vector_length != -1) vector_length = handle->vector_length; BSR_GEMV_Transpose_Functor func(alpha, A, x, y, useConjugate); @@ -1319,13 +1277,12 @@ struct BSR_GEMM_Functor { // spMatMultiVec_no_transpose: version for CPU execution spaces // (RangePolicy or trivial serial impl used) // -template ()>::type * = nullptr> void spMatMultiVec_no_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -1344,15 +1301,8 @@ void spMatMultiVec_no_transpose( AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; BSR_GEMM_Functor func(alpha, A, x, beta, y, useConjugate); @@ -1379,13 +1329,12 @@ void spMatMultiVec_no_transpose( // spMatMultiVec_no_transpose: version for GPU execution spaces (TeamPolicy // used) // -template ()>::type * = nullptr> void spMatMultiVec_no_transpose( - const typename AD::execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, + const typename AD::execution_space &exec, Handle *handle, const AlphaType &alpha, 
const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, @@ -1399,15 +1348,10 @@ void spMatMultiVec_no_transpose( AMatrix_Internal; typedef typename AMatrix_Internal::execution_space execution_space; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = + handle->force_dynamic_schedule; // Forces the use of a dynamic schedule + bool use_static_schedule = + handle->force_static_schedule; // Forces the use of a static schedule int team_size = -1; int vector_length = -1; @@ -1429,14 +1373,10 @@ void spMatMultiVec_no_transpose( } // - // Use the controls to allow the user to pass in some tuning parameters. + // Use the handle to allow the user to pass in some tuning parameters. 
// - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } + if (handle->team_size != -1) team_size = handle->team_size; + if (handle->vector_length != -1) vector_length = handle->vector_length; BSR_GEMM_Functor func(alpha, A, x, beta, y, useConjugate); @@ -1649,14 +1589,13 @@ struct BSR_GEMM_Transpose_Functor { /// \brief spMatMultiVec_transpose: version for CPU execution spaces /// (RangePolicy or trivial serial impl used) -template ()>::type * = nullptr> void spMatMultiVec_transpose( - const execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, - const AlphaType &alpha, + const execution_space &exec, Handle *handle, const AlphaType &alpha, const KokkosSparse::Experimental::BsrMatrix< AT, AO, AD, Kokkos::MemoryTraits, AS> &A, const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { @@ -1674,15 +1613,8 @@ void spMatMultiVec_transpose( AT, AO, AD, Kokkos::MemoryTraits, AS> AMatrix_Internal; - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; BSR_GEMM_Transpose_Functor @@ -1705,15 +1637,14 @@ void spMatMultiVec_transpose( // // spMatMultiVec_transpose: version for GPU execution spaces (TeamPolicy used) // -template ()>::type * = nullptr> -void spMatMultiVec_transpose( - const execution_space &exec, - const KokkosKernels::Experimental::Controls &controls, - const AlphaType &alpha, const AMatrix &A, const XVector 
&x, - const BetaType &beta, YVector &y, bool useConjugate) { +void spMatMultiVec_transpose(const execution_space &exec, Handle *handle, + const AlphaType &alpha, const AMatrix &A, + const XVector &x, const BetaType &beta, YVector &y, + bool useConjugate) { if (A.numRows() <= 0) { return; } @@ -1723,18 +1654,11 @@ void spMatMultiVec_transpose( else if (beta != Kokkos::ArithTraits::one()) KokkosBlas::scal(exec, y, beta, y); - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } - int team_size = -1; - int vector_length = -1; - int64_t worksets = A.numRows(); + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; + int team_size = -1; + int vector_length = -1; + int64_t worksets = A.numRows(); const auto block_dim = A.blockDim(); if (block_dim <= 4) { @@ -1752,15 +1676,10 @@ void spMatMultiVec_transpose( } // - // Use the controls to allow the user to pass in some tuning - // parameters. + // Use the handle to allow the user to pass in some tuning parameters. 
// - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } + if (handle->team_size != -1) team_size = handle->team_size; + if (handle->vector_length != -1) vector_length = handle->vector_length; BSR_GEMM_Transpose_Functor func( alpha, A, x, y, useConjugate); @@ -1813,9 +1732,7 @@ void spMatMultiVec_transpose( /* ******************* */ } // namespace Bsr - } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_IMPL_HPP_ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp index 1c0d2fc3614b..a0f4ed154008 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp @@ -121,11 +121,6 @@ void apply_v42(const typename AMatrix::execution_space &exec, Kokkos::RangePolicy policy(exec, 0, y.size()); if constexpr (YVector::rank == 1) { -// lbv - 07/26/2023: -// with_unmanaged_t<...> required Kokkos 4.1.0, -// the content of this header will be guarded -// until v4.3.0 -#if KOKKOS_VERSION >= 40100 || defined(DOXY) // Implementation expects a 2D view, so create an unmanaged 2D view // with extent 1 in the second dimension using Y2D = KokkosKernels::Impl::with_unmanaged_t>; -#else - // Implementation expects a 2D view, so create an unmanaged 2D view - // with extent 1 in the second dimension - using Y2D = Kokkos::View< - typename YVector::value_type * [1], typename YVector::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits>; - using X2D = Kokkos::View< - typename XVector::value_type * [1], typename XVector::array_layout, - typename XVector::device_type, Kokkos::MemoryTraits>; -#endif // KOKKOS_VERSION >= 
40100 || defined(DOXY) const Y2D yu(y.data(), y.extent(0), 1); const X2D xu(x.data(), x.extent(0), 1); BsrSpmvV42NonTrans op(alpha, a, xu, beta, yu); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 564100879eee..5c2bf0edfa58 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -21,7 +21,7 @@ #include #include "KokkosSparse_BsrMatrix.hpp" -#include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_spmv_handle.hpp" #include "KokkosKernels_Error.hpp" #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include @@ -29,32 +29,32 @@ #endif namespace KokkosSparse { -namespace Experimental { namespace Impl { // default is no eti available -template +template struct spmv_bsrmatrix_eti_spec_avail { enum : bool { value = false }; }; -template > struct spmv_mv_bsrmatrix_eti_spec_avail { enum : bool { value = false }; }; -} // namespace Impl -} // namespace Experimental -} // namespace KokkosSparse - #define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL( \ SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ MEM_SPACE_TYPE) \ template <> \ struct spmv_bsrmatrix_eti_spec_avail< \ EXEC_SPACE_TYPE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -75,6 +75,9 @@ struct spmv_mv_bsrmatrix_eti_spec_avail { template <> \ struct spmv_mv_bsrmatrix_eti_spec_avail< \ EXEC_SPACE_TYPE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -89,86 +92,83 @@ struct spmv_mv_bsrmatrix_eti_spec_avail { enum : bool { value = true }; \ }; +} // namespace Impl +} // namespace KokkosSparse + // Include which ETIs are available #include 
#include #include namespace KokkosSparse { -namespace Experimental { namespace Impl { // declaration -template ::value, + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_bsrmatrix_eti_spec_avail< - ExecutionSpace, AMatrix, XVector, YVector>::value> + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value> struct SPMV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; - static void spmv_bsrmatrix( - const ExecutionSpace &space, - const KokkosKernels::Experimental::Controls &controls, const char mode[], - const YScalar &alpha, const AMatrix &A, const XVector &x, - const YScalar &beta, const YVector &y); + static void spmv_bsrmatrix(const ExecutionSpace &space, Handle *handle, + const char mode[], const YScalar &alpha, + const AMatrix &A, const XVector &x, + const YScalar &beta, const YVector &y); }; // declaration -template , bool tpl_spec_avail = spmv_mv_bsrmatrix_tpl_spec_avail< - ExecutionSpace, AMatrix, XVector, YVector>::value, + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value, bool eti_spec_avail = spmv_mv_bsrmatrix_eti_spec_avail< - ExecutionSpace, AMatrix, XVector, YVector>::value> + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value> struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; - static void spmv_mv_bsrmatrix( - const ExecutionSpace &space, - const KokkosKernels::Experimental::Controls &controls, const char mode[], - const YScalar &alpha, const AMatrix &A, const XVector &x, - const YScalar &beta, const YVector &y); + static void spmv_mv_bsrmatrix(const ExecutionSpace &space, Handle *handle, + const char mode[], const YScalar &alpha, + const AMatrix &A, const XVector &x, + const YScalar &beta, const YVector &y); }; // actual implementations to be compiled #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -// these should all be different -constexpr inline const char *ALG_V41 = "v4.1"; -constexpr inline const char *ALG_V42 = 
"v4.2"; -constexpr inline const char *ALG_TC = "experimental_bsr_tc"; - -template -struct SPMV_BSRMATRIX +struct SPMV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; - static void spmv_bsrmatrix( - const ExecutionSpace &space, - const KokkosKernels::Experimental::Controls &controls, const char mode[], - const YScalar &alpha, const AMatrix &A, const XVector &X, - const YScalar &beta, const YVector &Y) { + static void spmv_bsrmatrix(const ExecutionSpace &space, Handle *handle, + const char mode[], const YScalar &alpha, + const AMatrix &A, const XVector &X, + const YScalar &beta, const YVector &Y) { const bool modeIsNoTrans = (mode[0] == NoTranspose[0]); const bool modeIsConjugate = (mode[0] == Conjugate[0]); const bool modeIsConjugateTrans = (mode[0] == ConjugateTranspose[0]); const bool modeIsTrans = (mode[0] == Transpose[0]); // use V41 if requested - if (controls.getParameter("algorithm") == ALG_V41) { + if (handle->algo == SPMV_BSR_V41) { if (modeIsNoTrans || modeIsConjugate) { - return Bsr::spMatVec_no_transpose(space, controls, alpha, A, X, beta, Y, + return Bsr::spMatVec_no_transpose(space, handle, alpha, A, X, beta, Y, modeIsConjugate); } else if (modeIsTrans || modeIsConjugateTrans) { - return Bsr::spMatVec_transpose(space, controls, alpha, A, X, beta, Y, + return Bsr::spMatVec_transpose(space, handle, alpha, A, X, beta, Y, modeIsConjugateTrans); } } // use V42 if possible if (KokkosKernels::Impl::kk_is_gpu_exec_space() || - controls.getParameter("algorithm") == ALG_V42) { + handle->algo == SPMV_BSR_V42) { if (modeIsNoTrans) { ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; @@ -177,10 +177,10 @@ struct SPMV_BSRMATRIX -struct SPMV_MV_BSRMATRIX { +template +struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; enum class Method { @@ -204,27 +205,18 @@ struct SPMV_MV_BSRMATRIX::value) { + if (handle->algo == SPMV_BSR_TC) method = Method::TensorCores; + if 
(!KokkosSparse::Impl::TensorCoresAvailable::value) { method = Method::Fallback; } // can't use tensor cores unless mode is no-transpose @@ -249,28 +241,23 @@ struct SPMV_MV_BSRMATRIXbsr_tc_precision; switch (precision) { - case Precision::Mixed: { + case KokkosSparse::Experimental::Bsr_TC_Precision::Mixed: { BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, X, beta, Y); return; } - case Precision::Double: { + case KokkosSparse::Experimental::Bsr_TC_Precision::Double: { BsrMatrixSpMVTensorCoreDispatcher::dispatch(space, alpha, A, X, beta, Y); return; } - case Precision::Automatic: // fallthrough + case KokkosSparse::Experimental::Bsr_TC_Precision::Automatic: default: { constexpr bool operandsHalfHalfFloat = std::is_same::value && @@ -312,19 +299,19 @@ struct SPMV_MV_BSRMATRIXalgo == SPMV_BSR_V41) { if (modeIsNoTrans || modeIsConjugate) { - return Bsr::spMatMultiVec_no_transpose(space, controls, alpha, A, X, - beta, Y, modeIsConjugate); + return Bsr::spMatMultiVec_no_transpose(space, handle, alpha, A, X, beta, + Y, modeIsConjugate); } else if (modeIsTrans || modeIsConjugateTrans) { - return Bsr::spMatMultiVec_transpose(space, controls, alpha, A, X, beta, - Y, modeIsConjugateTrans); + return Bsr::spMatMultiVec_transpose(space, handle, alpha, A, X, beta, Y, + modeIsConjugateTrans); } } // use V42 if possible if (KokkosKernels::Impl::kk_is_gpu_exec_space() || - controls.getParameter("algorithm") == ALG_V42) { + handle->algo == SPMV_BSR_V42) { if (modeIsNoTrans) { ::KokkosSparse::Impl::apply_v42(space, alpha, A, X, beta, Y); return; @@ -333,10 +320,10 @@ struct SPMV_MV_BSRMATRIX -struct SPMV_MV_BSRMATRIX { +template +struct SPMV_MV_BSRMATRIX { typedef typename YVector::non_const_value_type YScalar; - static void spmv_mv_bsrmatrix( - const ExecutionSpace &space, - const KokkosKernels::Experimental::Controls &controls, const char mode[], - const YScalar &alpha, const AMatrix &A, const XVector &X, - const YScalar &beta, const YVector &Y) { + static void 
spmv_mv_bsrmatrix(const ExecutionSpace &space, Handle *handle, + const char mode[], const YScalar &alpha, + const AMatrix &A, const XVector &X, + const YScalar &beta, const YVector &Y) { static_assert(std::is_integral_v, "This implementation is only for integer Scalar types."); - for (typename AMatrix::non_const_size_type j = 0; j < X.extent(1); ++j) { + for (size_t j = 0; j < X.extent(1); ++j) { const auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); - typedef SPMV_BSRMATRIX impl_type; - impl_type::spmv_bsrmatrix(space, controls, mode, alpha, A, x_j, beta, - y_j); + impl_type::spmv_bsrmatrix(space, handle, mode, alpha, A, x_j, beta, y_j); } } }; #endif // !defined(KOKKOSKERNELS_ETI_ONLY) || // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse // declare / instantiate the vector version @@ -387,6 +372,9 @@ struct SPMV_MV_BSRMATRIX, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -405,6 +393,9 @@ struct SPMV_MV_BSRMATRIX, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -426,6 +417,9 @@ struct SPMV_MV_BSRMATRIX, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ @@ -444,6 +438,9 @@ struct SPMV_MV_BSRMATRIX, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl.hpp index 4f90002a61fa..5f9cbea04004 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -24,6 +24,7 @@ #include "KokkosBlas1_scal.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spmv_handle.hpp" 
#include "KokkosSparse_spmv_impl_omp.hpp" #include "KokkosSparse_spmv_impl_merge.hpp" #include "KokkosKernels_Error.hpp" @@ -249,16 +250,15 @@ int64_t spmv_launch_parameters(int64_t numRows, int64_t nnz, // spmv_beta_no_transpose: version for CPU execution spaces (RangePolicy or // trivial serial impl used) -template ()>::type* = nullptr> -static void spmv_beta_no_transpose( - const execution_space& exec, - const KokkosKernels::Experimental::Controls& controls, - typename YVector::const_value_type& alpha, const AMatrix& A, - const XVector& x, typename YVector::const_value_type& beta, - const YVector& y) { +static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, + typename YVector::const_value_type& alpha, + const AMatrix& A, const XVector& x, + typename YVector::const_value_type& beta, + const YVector& y) { typedef typename AMatrix::non_const_ordinal_type ordinal_type; if (A.numRows() <= static_cast(0)) { @@ -363,15 +363,8 @@ static void spmv_beta_no_transpose( } #endif - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; SPMV_Functor func(alpha, A, x, beta, y, 1); if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) @@ -389,47 +382,26 @@ static void spmv_beta_no_transpose( } // spmv_beta_no_transpose: version for GPU execution spaces (TeamPolicy used) -template ()>::type* = nullptr> -static void spmv_beta_no_transpose( - const execution_space& exec, - const KokkosKernels::Experimental::Controls& controls, - typename YVector::const_value_type& alpha, const AMatrix& A, - const 
XVector& x, typename YVector::const_value_type& beta, - const YVector& y) { +static void spmv_beta_no_transpose(const execution_space& exec, Handle* handle, + typename YVector::const_value_type& alpha, + const AMatrix& A, const XVector& x, + typename YVector::const_value_type& beta, + const YVector& y) { typedef typename AMatrix::non_const_ordinal_type ordinal_type; if (A.numRows() <= static_cast(0)) { return; } - bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule - bool use_static_schedule = false; // Forces the use of a static schedule - if (controls.isParameter("schedule")) { - if (controls.getParameter("schedule") == "dynamic") { - use_dynamic_schedule = true; - } else if (controls.getParameter("schedule") == "static") { - use_static_schedule = true; - } - } - int team_size = -1; - int vector_length = -1; - int64_t rows_per_thread = -1; - - // Note on 03/24/20, lbv: We can use the controls - // here to allow the user to pass in some tunning - // parameters. - if (controls.isParameter("team size")) { - team_size = std::stoi(controls.getParameter("team size")); - } - if (controls.isParameter("vector length")) { - vector_length = std::stoi(controls.getParameter("vector length")); - } - if (controls.isParameter("rows per thread")) { - rows_per_thread = std::stoll(controls.getParameter("rows per thread")); - } + bool use_dynamic_schedule = handle->force_dynamic_schedule; + bool use_static_schedule = handle->force_static_schedule; + int team_size = handle->team_size; + int vector_length = handle->vector_length; + int64_t rows_per_thread = handle->rows_per_thread; int64_t rows_per_team = spmv_launch_parameters( A.numRows(), A.nnz(), rows_per_thread, team_size, vector_length); @@ -622,30 +594,29 @@ static void spmv_beta_transpose(const execution_space& exec, op); } -template -static void spmv_beta(const execution_space& exec, - const KokkosKernels::Experimental::Controls& controls, +template +static void spmv_beta(const execution_space& exec, 
Handle* handle, const char mode[], typename YVector::const_value_type& alpha, const AMatrix& A, const XVector& x, typename YVector::const_value_type& beta, const YVector& y) { if (mode[0] == NoTranspose[0]) { - if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_NATIVE_MERGE) { + if (handle->algo == SPMV_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { - spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + spmv_beta_no_transpose(exec, handle, alpha, A, x, beta, y); } } else if (mode[0] == Conjugate[0]) { - if (controls.getParameter("algorithm") == KOKKOSSPARSE_ALG_NATIVE_MERGE) { + if (handle->algo == SPMV_MERGE_PATH) { SpmvMergeHierarchical::spmv( exec, mode, alpha, A, x, beta, y); } else { - spmv_beta_no_transpose(exec, controls, alpha, A, x, beta, y); + spmv_beta_no_transpose(exec, handle, alpha, A, x, beta, y); } } else if (mode[0] == Transpose[0]) { spmv_beta_transpose #include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_spmv_handle.hpp" // Include the actual functors #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include @@ -30,11 +30,13 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_eti_spec_avail { enum : bool { value = false }; }; -template > struct spmv_mv_eti_spec_avail { @@ -50,6 +52,9 @@ struct spmv_mv_eti_spec_avail { template <> \ struct spmv_eti_spec_avail< \ EXEC_SPACE_TYPE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -70,6 +75,9 @@ struct spmv_mv_eti_spec_avail { template <> \ struct spmv_mv_eti_spec_avail< \ EXEC_SPACE_TYPE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -100,17 +108,16 @@ namespace Impl { /// /// For the implementation of KokkosSparse::spmv for multivectors (2-D /// Views), see the SPMV_MV struct 
below. -template < - class ExecutionSpace, class AMatrix, class XVector, class YVector, - bool tpl_spec_avail = - spmv_tpl_spec_avail::value, - bool eti_spec_avail = - spmv_eti_spec_avail::value> +template ::value, + bool eti_spec_avail = spmv_eti_spec_avail< + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value> struct SPMV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& controls, + static void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); @@ -140,18 +147,18 @@ struct SPMV { /// matrix's entries have integer type. Per Github Issue #700, we /// don't optimize as heavily for that case, in order to reduce build /// times and library sizes. -template , - bool tpl_spec_avail = spmv_mv_tpl_spec_avail::value, - bool eti_spec_avail = spmv_mv_eti_spec_avail::value> + bool tpl_spec_avail = spmv_mv_tpl_spec_avail< + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value, + bool eti_spec_avail = spmv_mv_eti_spec_avail< + ExecutionSpace, Handle, AMatrix, XVector, YVector>::value> struct SPMV_MV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& controls, + static void spmv_mv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); @@ -160,90 +167,114 @@ struct SPMV_MV { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of spmv for single vectors (1-D Views). 
// Unification layer -template -struct SPMV +struct SPMV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& controls, + static void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { typedef Kokkos::ArithTraits KAT; - if (alpha == KAT::zero()) { - if (beta != KAT::one()) { - KokkosBlas::scal(space, y, beta, y); - } - return; - } - if (beta == KAT::zero()) { - spmv_beta( - space, controls, mode, alpha, A, x, beta, y); + spmv_beta( + space, handle, mode, alpha, A, x, beta, y); } else if (beta == KAT::one()) { - spmv_beta( - space, controls, mode, alpha, A, x, beta, y); + spmv_beta( + space, handle, mode, alpha, A, x, beta, y); } else if (beta == -KAT::one()) { - spmv_beta( - space, controls, mode, alpha, A, x, beta, y); + spmv_beta( + space, handle, mode, alpha, A, x, beta, y); } else { - spmv_beta( - space, controls, mode, alpha, A, x, beta, y); + spmv_beta( + space, handle, mode, alpha, A, x, beta, y); } } }; //! Full specialization of spmv_mv for single vectors (2-D Views). 
// Unification layer -template -struct SPMV_MV +struct SPMV_MV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& /*controls*/, + static void spmv_mv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { typedef Kokkos::ArithTraits KAT; - - if (alpha == KAT::zero()) { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); - } else if (alpha == KAT::one()) { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); - } else if (alpha == -KAT::one()) { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); + // Intercept special case: if x/y have only 1 column and both are + // contiguous, use the more efficient single-vector impl. + // + // We cannot do this if x or y is noncontiguous, because the column subview + // must be LayoutStride which is not ETI'd. 
+ // + // Do not use a TPL even if one is available for the types: + // we don't want the same handle being used in both TPL and non-TPL versions + if (x.extent(1) == size_t(1) && x.span_is_contiguous() && + y.span_is_contiguous()) { + Kokkos::View + x0(x.data(), x.extent(0)); + Kokkos::View + y0(y.data(), y.extent(0)); + if (beta == KAT::zero()) { + spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); + } else if (beta == KAT::one()) { + spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); + } else if (beta == -KAT::one()) { + spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); + } else { + spmv_beta(space, handle, mode, alpha, A, x0, beta, y0); + } } else { - spmv_alpha_mv( - space, mode, alpha, A, x, beta, y); + if (alpha == KAT::zero()) { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } else if (alpha == KAT::one()) { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } else if (alpha == -KAT::one()) { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } else { + spmv_alpha_mv( + space, mode, alpha, A, x, beta, y); + } } } }; -template -struct SPMV_MV +struct SPMV_MV { typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const ExecutionSpace& space, - const KokkosKernels::Experimental::Controls& /*controls*/, + static void spmv_mv(const ExecutionSpace& space, Handle* handle, const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { static_assert(std::is_integral_v, "This implementation is only for integer Scalar types."); KokkosKernels::Experimental::Controls defaultControls; - for (typename AMatrix::non_const_size_type j = 0; j < x.extent(1); ++j) { + for (size_t j = 0; j < x.extent(1); ++j) { auto x_j = Kokkos::subview(x, Kokkos::ALL(), j); auto y_j = Kokkos::subview(y, Kokkos::ALL(), j); - typedef SPMV + typedef SPMV impl_type; - impl_type::spmv(space, defaultControls, mode, alpha, A, x_j, beta, y_j); + 
impl_type::spmv(space, handle, mode, alpha, A, x_j, beta, y_j); } } }; @@ -264,6 +295,9 @@ struct SPMV_MV, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -282,6 +316,9 @@ struct SPMV_MV, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -300,6 +337,9 @@ struct SPMV_MV, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -318,6 +358,9 @@ struct SPMV_MV, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 7605f03fa2da..019a63fcd7aa 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -22,10 +22,11 @@ namespace KokkosSparse { namespace Impl { -template -void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, +void sptrsvcuSPARSE_symbolic(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -61,6 +62,9 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + int64_t nnz = static_cast(entries.extent(0)); size_t pBufferSize; void *rm; @@ -98,13 +102,13 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, CUSPARSE_INDEX_BASE_ZERO, cudaValueType)); // Create dummy dense vector B (RHS) - nnz_scalar_view_t b_dummy("b_dummy", nrows); + nnz_scalar_view_t b_dummy(Kokkos::view_alloc(space, "b_dummy"), nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecBDescr_dummy), static_cast(nrows), b_dummy.data(), cudaValueType)); // Create dummy dense vector X (LHS) - nnz_scalar_view_t x_dummy("x_dummy", nrows); + nnz_scalar_view_t x_dummy(Kokkos::view_alloc(space, "x_dummy"), 
nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecXDescr_dummy), static_cast(nrows), x_dummy.data(), cudaValueType)); @@ -155,17 +159,20 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, std::is_same::value || std::is_same::value; - if (!is_cuda_space) { + if constexpr (!is_cuda_space) { throw std::runtime_error( "KokkosKernels sptrsvcuSPARSE_symbolic: MEMORY IS NOT ALLOCATED IN GPU " "DEVICE for CUSPARSE\n"); - } else if (std::is_same::value) { + } else if constexpr (std::is_same::value) { bool is_lower = sptrsv_handle->is_lower_tri(); sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + cusparseStatus_t status; status = cusparseCreateCsrsv2Info(&(h->info)); if (CUSPARSE_STATUS_SUCCESS != status) @@ -178,85 +185,86 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, if (!std::is_same::value) sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr_copy(row_map) - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); if (std::is_same::value) { cusparseDcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (double*)vals, (int*)rm, (int*)ent, h->info, + (double *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. 
cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseDcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (double*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same::value) { cusparseScsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (float*)vals, (int*)rm, (int*)ent, h->info, + (float *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseScsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (float *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseZcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. 
cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; - status = cusparseZcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuDoubleComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + status = + cusparseZcsrsv2_analysis(h->handle, h->transpose, nrows, nnz, + h->descr, (cuDoubleComplex *)vals, (int *)rm, + (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseCcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, - &pBufferSize); + (cuComplex *)vals, (int *)rm, (int *)ent, + h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. 
cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseCcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; @@ -269,6 +277,7 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, } #endif #else + (void)space; (void)sptrsv_handle; (void)nrows; (void)row_map; @@ -281,10 +290,11 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, } template < - typename KernelHandle, typename ain_row_index_view_type, - typename ain_nonzero_index_view_type, typename ain_values_scalar_view_type, - typename b_values_scalar_view_type, typename x_values_scalar_view_type> -void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, + typename ExecutionSpace, typename KernelHandle, + typename ain_row_index_view_type, typename ain_nonzero_index_view_type, + typename ain_values_scalar_view_type, typename b_values_scalar_view_type, + typename x_values_scalar_view_type> +void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -323,6 +333,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + const scalar_type alpha = scalar_type(1.0); cudaDataType cudaValueType = cuda_data_type_from(); @@ -354,18 +367,23 @@ void 
sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, if (std::is_same::value) { cusparseStatus_t status; - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + } + int nnz = entries.extent_int(0); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr() - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); - const scalar_type* bv = rhs.data(); - scalar_type* xv = lhs.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); + const scalar_type *bv = rhs.data(); + scalar_type *xv = lhs.data(); if (std::is_same::value) { if (h->pBuffer == nullptr) { @@ -373,10 +391,10 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, } const double alpha = double(1); - status = cusparseDcsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (double*)vals, (int*)rm, - (int*)ent, h->info, (double*)bv, - (double*)xv, h->policy, h->pBuffer); + status = cusparseDcsrsv2_solve( + h->handle, h->transpose, nrows, nnz, &alpha, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, (double *)bv, (double *)xv, h->policy, + h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -387,9 +405,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, const float alpha = float(1); status = cusparseScsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, (float*)bv, (float*)xv, - h->policy, h->pBuffer); + &alpha, h->descr, (float *)vals, (int *)rm, + (int *)ent, h->info, (float *)bv, + (float *)xv, h->policy, h->pBuffer); if 
(CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -399,8 +417,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseZcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h->info, - (cuDoubleComplex*)bv, (cuDoubleComplex*)xv, h->policy, h->pBuffer); + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, + (cuDoubleComplex *)bv, (cuDoubleComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -410,8 +428,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseCcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, (cuComplex*)bv, - (cuComplex*)xv, h->policy, h->pBuffer); + (cuComplex *)vals, (int *)rm, (int *)ent, h->info, (cuComplex *)bv, + (cuComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -425,6 +443,7 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, } #endif #else + (void)space; (void)sptrsv_handle; (void)nrows; (void)row_map; @@ -539,13 +558,13 @@ void sptrsvcuSPARSE_solve_streams( "CUSPARSE requires local ordinals to be integer.\n"); } else { const scalar_type alpha = scalar_type(1.0); - std::vector sptrsv_handle_v(nstreams); - std::vector h_v(nstreams); - std::vector rm_v(nstreams); - std::vector ent_v(nstreams); - std::vector vals_v(nstreams); - std::vector bv_v(nstreams); - std::vector xv_v(nstreams); + std::vector sptrsv_handle_v(nstreams); + std::vector h_v(nstreams); + std::vector rm_v(nstreams); + std::vector ent_v(nstreams); + std::vector vals_v(nstreams); + std::vector bv_v(nstreams); + std::vector xv_v(nstreams); for (int i = 0; i < nstreams; i++) { sptrsv_handle_v[i] = 
handle_v[i].get_sptrsv_handle(); @@ -560,8 +579,8 @@ void sptrsvcuSPARSE_solve_streams( } rm_v[i] = !std::is_same::value ? sptrsv_handle_v[i]->get_int_rowmap_ptr() - : reinterpret_cast(row_map_v[i].data()); - ent_v[i] = reinterpret_cast(entries_v[i].data()); + : reinterpret_cast(row_map_v[i].data()); + ent_v[i] = reinterpret_cast(entries_v[i].data()); vals_v[i] = values_v[i].data(); bv_v[i] = rhs_v[i].data(); xv_v[i] = lhs_v[i].data(); @@ -573,42 +592,42 @@ void sptrsvcuSPARSE_solve_streams( if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - 
reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else { throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index ee7e83b55449..a64a4d23bc03 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -664,8 +664,6 @@ struct LowerTriLvlSchedTP2SolverFunctor { // Helper functors for Lower-triangular solve with SpMV template struct SparseTriSupernodalSpMVFunctor { - // using execution_space = typename LHSType::execution_space; - // using memory_space = typename execution_space::memory_space; using execution_space = typename TriSolveHandle::HandleExecSpace; using memory_space = typename TriSolveHandle::HandleTempMemorySpace; @@ -2891,16 +2889,15 @@ void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, #endif -template -void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType 
&rhs, LHSType &lhs) { +template +void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -2914,7 +2911,8 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename TriSolveHandle::memory_space; + using memory_space = typename TriSolveHandle::HandleTempMemorySpace; + using device_t = Kokkos::Device; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; @@ -2981,8 +2979,10 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedRPSolverFunctor( @@ -2990,8 +2990,8 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; - int team_size = thandle.get_team_size(); + using team_policy_t = Kokkos::TeamPolicy; + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor; + using team_policy_type = 
Kokkos::TeamPolicy; using supernode_view_type = - Kokkos::View; if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to scatter the @@ -3079,9 +3087,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; @@ -3118,7 +3129,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, auto Ljj = Kokkos::subview( viewL, range_type(0, nsrow), Kokkos::ALL()); // s-th supernocal column of L - KokkosBlas::gemv("N", one, Ljj, Xj, zero, Y); + KokkosBlas::gemv(space, "N", one, Ljj, Xj, zero, Y); } else { auto Xj = Kokkos::subview( lhs, @@ -3131,15 +3142,17 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (invert_diagonal) { auto Y = Kokkos::subview( work, range_type(workoffset, workoffset + nscol)); - KokkosBlas::gemv("N", one, Ljj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ljj, Y, zero, Xj); } else { char unit_diag = (unit_diagonal ? 
'U' : 'N'); // NOTE: we currently supports only default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "N", &unit_diag, one, Ljj, Xjj); + KokkosBlas::trsm(space, "L", "L", "N", &unit_diag, one, Ljj, + Xjj); + // TODO: space.fence(); Kokkos::fence(); } // update off-diagonal blocks @@ -3155,7 +3168,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, viewL, range_type(nscol, nsrow), Kokkos::ALL()); // off-diagonal blocks of s-th supernodal // column of L - KokkosBlas::gemv("N", one, Lij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Lij, Xj, zero, Z); } } } @@ -3165,9 +3178,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3178,9 +3194,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3200,7 +3219,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #endif // initialize input & output vectors - using team_policy_type = 
Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; // update with spmv (one or two SpMV) bool transpose_spmv = @@ -3210,36 +3229,45 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update off-diagonals (potentiall combined with solve with // diagonals) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), 
- sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3272,16 +3300,17 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } // end lower_tri_solve -template -void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { +template +void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; - + using memory_space = typename TriSolveHandle::HandleTempMemorySpace; + using device_t = Kokkos::Device; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3298,7 +3327,6 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) using namespace KokkosSparse::Experimental; - using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; using scalar_t = typename ValuesType::non_const_value_type; @@ -3365,14 +3393,16 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + 
lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), UpperTriLvlSchedRPSolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; + using team_policy_t = Kokkos::TeamPolicy; int team_size = thandle.get_team_size(); @@ -3388,11 +3418,19 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, node_count); #endif if (team_size == -1) - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, Kokkos::AUTO), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); else - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, team_size), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, team_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } // TP2 algorithm has issues with some offset-ordinal combo to be addressed /* @@ -3444,7 +3482,7 @@ tstf); } // end elseif timer.reset(); #endif - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; if (thandle.is_column_major()) { // U stored in CSC if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to gather the input @@ -3457,9 +3495,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + 
sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; league_rank++) { @@ -3486,7 +3527,7 @@ tstf); } // end elseif // create a view for the s-th supernocal block column // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, nscol); @@ -3500,7 +3541,7 @@ tstf); } // end elseif workoffset, workoffset + nsrow)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } else { // extract part of the solution, corresponding to the diagonal // block @@ -3517,14 +3558,14 @@ tstf); } // end elseif workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv - KokkosBlas::gemv("N", one, Ujj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); } else { // NOTE: we currently supports only default_layout = // LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "U", "N", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); } // update off-diagonal blocks if (nsrow2 > 0) { @@ -3538,7 +3579,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } } } @@ -3548,9 +3589,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3562,10 +3606,13 @@ tstf); } // end elseif diag_kernel_type, lhs, work, work_offset, 
nodes_grouped_by_level, node_count); - using policy_type = Kokkos::TeamPolicy; - Kokkos::parallel_for("parfor_usolve_tran_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_tran_supernode", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); } else { // U stored in CSR // launching sparse-triangular solve functor UpperTriSupernodalFunctor; - Kokkos::parallel_for("parfor_usolve_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_supernode", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to gather the input @@ -3608,7 +3658,7 @@ tstf); } // end elseif // create a view for the s-th supernocal block column // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View viewU(&dataU[i1], nsrow, nscol); @@ -3634,7 +3684,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("T", -one, Uij, Z, one, Xj); + KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); } // "triangular-solve" to compute Xj @@ -3642,13 +3692,13 @@ tstf); } // end elseif auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); if (invert_diagonal) { - KokkosBlas::gemv("T", one, Ujj, Xj, zero, Y); + KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { // NOTE: we currently supports only default_layout = LayoutLeft - Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "T", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); } } 
if (invert_diagonal) { @@ -3657,9 +3707,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } } @@ -3680,7 +3733,7 @@ tstf); } // end elseif #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; // update with one, or two, spmv bool transpose_spmv = @@ -3691,28 +3744,34 @@ tstf); } // end elseif if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // zero out lhs corresponding to diagonal blocks in lhs, and copy to // work SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + 
team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update with off-diagonals (potentiall combined with diagonal // solves) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); } else { if (!invert_offdiagonal) { // zero out lhs corresponding to diagonal blocks in lhs, and copy to @@ -3720,17 +3779,20 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); // update with off-diagonals auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, submat, lhs, one, work); // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); } else { std::cout << " ** invert_offdiag with U in CSR not supported **" << std::endl; @@ -3740,9 +3802,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef 
profile_supernodal_etree Kokkos::fence(); @@ -3765,23 +3830,22 @@ tstf); } // end elseif double sptrsv_time_seconds = sptrsv_timer.seconds(); std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl << std::endl; - std::cout << " + Execution space : " << execution_space::name() + std::cout << " + Execution space : " << ExecutionSpace::name() << std::endl; std::cout << " + Memory space : " << memory_space::name() << std::endl; #endif } // end upper_tri_solve -template -void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs, +template +void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, LHSType &lhs, const bool /*is_lowertri_*/) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3802,9 +3866,9 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, size_type node_count = 0; // REFACTORED to cleanup; next, need debug and timer routines - using policy_type = Kokkos::TeamPolicy; + using policy_type = Kokkos::TeamPolicy; using large_cutoff_policy_type = - Kokkos::TeamPolicy; + Kokkos::TeamPolicy; /* using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = @@ -3865,14 +3929,17 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? 
- Kokkos::parallel_for("parfor_l_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_l_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3884,7 +3951,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, if (team_size_singleblock <= 0) { team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -3907,7 +3974,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -3925,11 +3995,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? 
@@ -3955,16 +4029,19 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } // TODO To use cudagraph here, need to know how many non-unit chains // there are, create a graph for each and launch accordingly size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for("parfor_u_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_u_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3980,7 +4057,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), // Kokkos::ParallelForTag()); team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -4003,7 +4080,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -4021,11 +4101,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + 
vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index e36b9df2369d..6ad321c286b9 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -96,9 +96,9 @@ template ::value> struct SPTRSV_SOLVE { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x); + static void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x); static void sptrsv_solve_streams( const std::vector &execspace_v, @@ -117,9 +117,9 @@ template { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x) { + static void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x) { // Call specific algorithm type auto sptrsv_handle = handle->get_sptrsv_handle(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() @@ -127,40 +127,44 @@ struct SPTRSV_SOLVEis_lower_tri()) { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, 
entries, values, - b, x, true); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, true); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::lower_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, entries, values, - b, x, false); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, false); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::upper_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } @@ -188,7 +192,8 @@ struct SPTRSV_SOLVEis_lower_tri()) { for (int i = 0; i < static_cast(execspace_v.size()); i++) { if (sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + Experimental::lower_tri_symbolic(execspace_v[i], + *(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); } } @@ -198,7 +203,8 @@ struct SPTRSV_SOLVE(execspace_v.size()); i++) { if 
(sptrsv_handle_v[i]->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*(sptrsv_handle_v[i]), row_map_v[i], + Experimental::upper_tri_symbolic(execspace_v[i], + *(sptrsv_handle_v[i]), row_map_v[i], entries_v[i]); } } diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 3ef3be878070..36ea2d9df82e 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -147,9 +147,10 @@ void symbolic_chain_phase(TriSolveHandle& thandle, #endif } // end symbolic_chain_phase -template -void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, - const EntriesType dentries) { +template +void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_lowertri_total; Kokkos::Timer timer; @@ -177,10 +178,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -193,11 +194,12 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type 
node_count = 0; + space.fence(); // wait for deep copy write to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -227,9 +229,9 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, // Create the chain now if (thandle.algm_requires_symb_chain()) { + // No need to pass in space, chain phase runs on the host symbolic_chain_phase(thandle, nodes_per_level); } - thandle.set_symbolic_complete(); // Output check @@ -257,9 +259,9 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -279,6 +281,7 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", @@ -568,20 +571,21 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, thandle.set_workspace_size(max_lwork); // workspace offset initialized to be zero integer_view_t work_offset = thandle.get_work_offset(); - Kokkos::deep_copy(work_offset, work_offset_host); + Kokkos::deep_copy(space, work_offset, work_offset_host); // kernel types // > off-diagonal integer_view_t dkernel_type_by_level = thandle.get_kernel_type(); - Kokkos::deep_copy(dkernel_type_by_level, kernel_type_by_level); + Kokkos::deep_copy(space, dkernel_type_by_level, kernel_type_by_level); // > diagonal integer_view_t ddiag_kernel_type_by_level = 
thandle.get_diag_kernel_type(); - Kokkos::deep_copy(ddiag_kernel_type_by_level, diag_kernel_type_by_level); + Kokkos::deep_copy(space, ddiag_kernel_type_by_level, + diag_kernel_type_by_level); // deep copy to device (of scheduling info) - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); #ifdef TRISOLVE_SYMB_TIMERS std::cout << " + workspace time = " << timer.seconds() << std::endl; @@ -598,9 +602,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif } // end lower_tri_symbolic -template -void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, - const EntriesType dentries) { +template +void upper_tri_symbolic(ExecutionSpace& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_uppertri_total; Kokkos::Timer timer; @@ -626,10 +631,10 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -642,11 +647,12 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - 
Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type node_count = 0; + space.fence(); // Wait for deep copy writes to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -708,9 +714,9 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -730,6 +736,7 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index 73389d10d076..5b9304356d5d 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -67,33 +67,37 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosSparse::sptrsv_symbolic -template ::value, bool eti_spec_avail = sptrsv_symbolic_eti_spec_avail< KernelHandle, RowMapType, EntriesType>::value> struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, + static void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType 
entries); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of sptrsv_symbolic // Unification layer -template -struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, +template +struct SPTRSV_SYMBOLIC { + static void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries) { auto sptrsv_handle = handle->get_sptrsv_handle(); auto nrows = row_map.extent(0) - 1; sptrsv_handle->new_init_handle(nrows); if (sptrsv_handle->is_lower_tri()) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } else { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } } @@ -113,6 +117,7 @@ struct SPTRSV_SYMBOLIC, \ @@ -130,6 +135,7 @@ struct SPTRSV_SYMBOLIC, \ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_impl.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_impl.hpp index fbbd547e347f..9adb029d12ab 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -14,15 +14,20 @@ // //@HEADER -#ifndef KOKKOSSPARSE_IMPL_TRSM_HPP_ -#define KOKKOSSPARSE_IMPL_TRSM_HPP_ +#ifndef KOKKOSSPARSE_TRSV_IMPL_HPP_ +#define KOKKOSSPARSE_TRSV_IMPL_HPP_ -/// \file KokkosSparse_impl_trsm.hpp -/// \brief Implementation(s) of sparse triangular solve. +/// \file KokkosSparse_trsv_impl.hpp +/// \brief Implementation(s) of sequential sparse triangular solve. 
#include #include -#include // temporarily +#include "KokkosBatched_Axpy.hpp" +#include "KokkosBatched_Gemm_Decl.hpp" +#include "KokkosBatched_Gemm_Serial_Impl.hpp" +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBlas2_gemv.hpp" +#include "KokkosBlas1_set.hpp" namespace KokkosSparse { namespace Impl { @@ -30,652 +35,694 @@ namespace Sequential { template -void lowerTriSolveCsrUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - // const local_ordinal_type numCols = A.numCols (); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type r = 0; r < numRows; ++r) { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); - } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); +struct TrsvWrap { + using offset_type = + typename CrsMatrixType::row_map_type::non_const_value_type; + using lno_t = typename CrsMatrixType::index_type::non_const_value_type; + using scalar_t = typename CrsMatrixType::values_type::non_const_value_type; + using device_t = typename CrsMatrixType::device_type; + using sview_1d = typename Kokkos::View; + using STS = Kokkos::ArithTraits; + + static inline void manual_copy(RangeMultiVectorType X, + DomainMultiVectorType Y) { + auto numRows = X.extent(0); + auto numVecs = 
X.extent(1); + for (decltype(numRows) i = 0; i < numRows; ++i) { + for (decltype(numVecs) j = 0; j < numVecs; ++j) { + X(i, j) = Y(i, j); } - } // for each entry A_rc in the current row r - } // for each row r -} - -template -void lowerTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type numRows = A.numRows(); - // const local_ordinal_type numCols = A.numCols (); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type r = 0; r < numRows; ++r) { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); } + } - matrix_scalar_type A_rr = STS::zero(); - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - // FIXME (mfh 28 Aug 2014) This assumes that the diagonal entry - // has equal local row and column indices. That may not - // necessarily hold, depending on the row and column Maps. The - // way to fix this would be for Tpetra::CrsMatrix to remember - // the local column index of the diagonal entry (if there is - // one) in each row, and pass that along to this function. 
- if (r == c) { - A_rr += A_rc; - } else { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } - } // for each entry A_rc in the current row r - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = X(r, j) / A_rr; + struct CommonUnblocked { + CommonUnblocked(const lno_t block_size) { + KK_REQUIRE_MSG(block_size == 1, + "Tried to use block_size>1 for non-block-enabled Common"); } - } // for each row r -} -template -void upperTriSolveCsrUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - // const local_ordinal_type numCols = A.numCols (); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - // If local_ordinal_type is unsigned and numRows is 0, the loop - // below will have entirely the wrong number of iterations. - if (numRows == 0) { - return; - } + scalar_t zero() { return STS::zero(); } - // Don't use r >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do r == 0 (last - // iteration) below. 
- for (local_ordinal_type r = numRows - 1; r != 0; --r) { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); - } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current row r - } // for each row r - - // Last iteration: r = 0. - { - const local_ordinal_type r = 0; - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); + template + scalar_t get(const ValuesView& vals, const offset_type i) { + return vals(i); } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current row r - } // last iteration: r = 0 -} -template -void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - // const local_ordinal_type numCols = A.numCols (); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - typedef Kokkos::ArithTraits STS; - - // If local_ordinal_type is unsigned and numRows is 0, the loop - // below will have entirely the wrong number of iterations. 
- if (numRows == 0) { - return; - } + void pluseq(scalar_t& lhs, const scalar_t& rhs) { lhs += rhs; } - // Don't use r >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do r == 0 (last - // iteration) below. - for (local_ordinal_type r = numRows - 1; r != 0; --r) { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); + void gemv(RangeMultiVectorType X, const scalar_t& A, const lno_t r, + const lno_t c, const lno_t j, const char = 'N') { + X(r, j) -= A * X(c, j); } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - matrix_scalar_type A_rr = STS::zero(); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - if (r == c) { - A_rr += A_rc; - } else { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } - } // for each entry A_rc in the current row r - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = X(r, j) / A_rr; + + template + void divide(RangeMultiVectorType X, const scalar_t& A, const lno_t r, + const lno_t j) { + X(r, j) /= A; + } + }; + + struct CommonBlocked { + // BSR data is in LayoutRight! 
+ using Layout = Kokkos::LayoutRight; + + using UBlock = Kokkos::View< + scalar_t**, Layout, typename CrsMatrixType::device_type, + Kokkos::MemoryTraits >; + + using Block = + Kokkos::View >; + + using Vector = Kokkos::View >; + + using UVector = Kokkos::View< + scalar_t*, typename CrsMatrixType::device_type, + Kokkos::MemoryTraits >; + + lno_t m_block_size; + lno_t m_block_items; + Vector m_ones; + Block m_data; + Block m_tmp; // Needed for SerialGesv + UBlock m_utmp; // Needed for SerialGesv + Vector m_vec_data1; + Vector m_vec_data2; + + CommonBlocked(const lno_t block_size) + : m_block_size(block_size), + m_block_items(block_size * block_size), + m_ones("ones", block_size), + m_data("m_data", block_size, block_size), + m_tmp("m_tmp", block_size, block_size + 4), + m_utmp(m_tmp.data(), block_size, block_size + 4), + m_vec_data1("m_vec_data1", block_size), + m_vec_data2("m_vec_data2", block_size) { + Kokkos::deep_copy(m_ones, 1.0); } - } // for each row r - // Last iteration: r = 0. - { - const local_ordinal_type r = 0; - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = Y(r, j); + UBlock zero() { + UBlock block(m_data.data(), m_block_size, m_block_size); + KokkosBlas::SerialSet::invoke(STS::zero(), block); + return block; } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - matrix_scalar_type A_rr = STS::zero(); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type c = ind(k); - if (r == c) - A_rr += A_rc; - else { - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } - } // for each entry A_rc in the current row r - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) = X(r, j) / A_rr; + + template + UBlock get(const ValuesView& vals, const offset_type i) { + scalar_t* data = const_cast(vals.data()); + UBlock rv(data + (i * m_block_items), m_block_size, m_block_size); + return rv; } - } // last iteration: r = 0 -} -template 
-void upperTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + void pluseq(UBlock& lhs, const UBlock& rhs) { + KokkosBatched::SerialAxpy::invoke(m_ones, rhs, lhs); } - } - // If local_ordinal_type is unsigned and numCols is 0, the loop - // below will have entirely the wrong number of iterations. - if (numCols == 0) { - return; - } + void gemv(RangeMultiVectorType X, const UBlock& A, const lno_t r, + const lno_t c, const lno_t j, const char transpose = 'N') { + // Create and populate x and y + UVector x(m_vec_data1.data(), m_block_size); + UVector y(m_vec_data2.data(), m_block_size); + for (lno_t b = 0; b < m_block_size; ++b) { + x(b) = X(c * m_block_size + b, j); + y(b) = X(r * m_block_size + b, j); + } + + KokkosBlas::Experimental::serial_gemv(transpose, -1, A, x, 1, y); - // Don't use c >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do c == 0 (last - // iteration) below. 
- for (local_ordinal_type c = numCols - 1; c != 0; --c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type r = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + for (lno_t b = 0; b < m_block_size; ++b) { + X(r * m_block_size + b, j) = y(b); } - } // for each entry A_rc in the current column c - } // for each column c - - // Last iteration: c = 0. - { - const local_ordinal_type c = 0; - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const matrix_scalar_type A_rc = val(k); - const local_ordinal_type r = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + } + + template + void divide(RangeMultiVectorType X, const UBlock& A, const lno_t r, + const lno_t j) { + UVector x(m_vec_data1.data(), m_block_size); + UVector y(m_vec_data2.data(), m_block_size); + for (lno_t b = 0; b < m_block_size; ++b) { + y(b) = X(r * m_block_size + b, j); } - } // for each entry A_rc in the current column c - } -} -template -void upperTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - 
X(i, j) = Y(i, j); - } - } + // if StaticPivoting is used, there are compiler errors related to + // comparing complex and non-complex. + using Algo = KokkosBatched::Gesv::NoPivoting; + + KokkosBatched::SerialGesv::invoke(A, x, y, m_utmp); - // If local_ordinal_type is unsigned and numCols is 0, the loop - // below will have entirely the wrong number of iterations. - if (numCols == 0) { - return; + for (lno_t b = 0; b < m_block_size; ++b) { + X(r * m_block_size + b, j) = x(b); + } + } + }; + + using CommonOps = std::conditional_t< + KokkosSparse::Experimental::is_bsr_matrix::value, + CommonBlocked, CommonUnblocked>; + + static void lowerTriSolveCsrUnitDiag(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + manual_copy(X, Y); + + for (lno_t r = 0; r < numRows; ++r) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t c = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current row r + } // for each row r } - // Don't use c >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do c == 0 (last - // iteration) below. 
- for (local_ordinal_type c = numCols - 1; c != 0; --c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = end - 1; k >= beg; --k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = val(k); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { + static void lowerTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + CommonOps co(block_size); + + manual_copy(X, Y); + + for (lno_t r = 0; r < numRows; ++r) { + auto A_rr = co.zero(); + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + + for (offset_type k = beg; k < end; ++k) { + const auto A_rc = co.get(val, k); + const lno_t c = ind(k); + // FIXME (mfh 28 Aug 2014) This assumes that the diagonal entry + // has equal local row and column indices. That may not + // necessarily hold, depending on the row and column Maps. The + // way to fix this would be for Tpetra::CrsMatrix to remember + // the local column index of the diagonal entry (if there is + // one) in each row, and pass that along to this function. 
if (r == c) { - X(c, j) = X(c, j) / A_rc; + co.pluseq(A_rr, A_rc); } else { - X(r, j) -= A_rc * X(c, j); + for (lno_t j = 0; j < numVecs; ++j) { + co.gemv(X, A_rc, r, c, j); + } } + } // for each entry A_rc in the current row r + for (lno_t j = 0; j < numVecs; ++j) { + co.template divide(X, A_rr, r, j); } - } // for each entry A_rc in the current column c - } // for each column c - - // Last iteration: c = 0. - { - const offset_type beg = ptr(0); - const matrix_scalar_type A_rc = val(beg); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(0, j) = X(0, j) / A_rc; - } + } // for each row r } -} -template -void lowerTriSolveCscUnitDiag(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + static void upperTriSolveCsrUnitDiag(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename 
CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + manual_copy(X, Y); + + // If lno_t is unsigned and numRows is 0, the loop + // below will have entirely the wrong number of iterations. + if (numRows == 0) { + return; } + + // Don't use r >= 0 as the test, because that fails if + // lno_t is unsigned. We do r == 0 (last + // iteration) below. + for (lno_t r = numRows - 1; r != 0; --r) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t c = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current row r + } // for each row r + + // Last iteration: r = 0. + { + const lno_t r = 0; + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t c = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current row r + } // last iteration: r = 0 } - for (local_ordinal_type c = 0; c < numCols; ++c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = val(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current column c - } // for each column c -} + static void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename 
CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; -template -void upperTriSolveCscUnitDiagConj(RangeMultiVectorType X, - const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); - } - } + CommonOps co(block_size); - // If local_ordinal_type is unsigned and numCols is 0, the loop - // below will have entirely the wrong number of iterations. - if (numCols == 0) { - return; - } + manual_copy(X, Y); - // Don't use c >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do c == 0 (last - // iteration) below. - for (local_ordinal_type c = numCols - 1; c != 0; --c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + // If lno_t is unsigned and numRows is 0, the loop + // below will have entirely the wrong number of iterations. + if (numRows == 0) { + return; + } + + // Don't use r >= 0 as the test, because that fails if + // lno_t is unsigned. We do r == 0 (last + // iteration) below. 
+ for (lno_t r = numRows - 1; r != 0; --r) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + auto A_rr = co.zero(); + for (offset_type k = beg; k < end; ++k) { + const auto A_rc = co.get(val, k); + const lno_t c = ind(k); + if (r == c) { + co.pluseq(A_rr, A_rc); + } else { + for (lno_t j = 0; j < numVecs; ++j) { + co.gemv(X, A_rc, r, c, j); + } + } + } // for each entry A_rc in the current row r + for (lno_t j = 0; j < numVecs; ++j) { + co.template divide(X, A_rr, r, j); } - } // for each entry A_rc in the current column c - } // for each column c - - // Last iteration: c = 0. - { - const local_ordinal_type c = 0; - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + } // for each row r + + // Last iteration: r = 0. + { + const lno_t r = 0; + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + auto A_rr = co.zero(); + for (offset_type k = beg; k < end; ++k) { + const auto A_rc = co.get(val, k); + const lno_t c = ind(k); + if (r == c) { + co.pluseq(A_rr, A_rc); + } else { + for (lno_t j = 0; j < numVecs; ++j) { + co.gemv(X, A_rc, r, c, j); + } + } + } // for each entry A_rc in the current row r + for (lno_t j = 0; j < numVecs; ++j) { + co.template divide(X, A_rr, r, j); } - } // for each entry A_rc in the current column c + } // last iteration: r = 0 } -} -template -void upperTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type 
numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + static void upperTriSolveCscUnitDiag(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + manual_copy(X, Y); + + // If lno_t is unsigned and numCols is 0, the loop + // below will have entirely the wrong number of iterations. + if (numCols == 0) { + return; } - } - - // If local_ordinal_type is unsigned and numCols is 0, the loop - // below will have entirely the wrong number of iterations. - if (numCols == 0) { - return; - } - // Don't use c >= 0 as the test, because that fails if - // local_ordinal_type is unsigned. We do c == 0 (last - // iteration) below. - for (local_ordinal_type c = numCols - 1; c != 0; --c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = end - 1; k >= beg; --k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. 
See - note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - if (r == c) { - X(c, j) = X(c, j) / A_rc; - } else { + // Don't use c >= 0 as the test, because that fails if + // lno_t is unsigned. We do c == 0 (last + // iteration) below. + for (lno_t c = numCols - 1; c != 0; --c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t r = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { X(r, j) -= A_rc * X(c, j); } - } - } // for each entry A_rc in the current column c - } // for each column c - - // Last iteration: c = 0. - { - const offset_type beg = ptr(0); - const matrix_scalar_type A_rc = STS::conj(val(beg)); - /*(vqd 20 Jul 2020) This assumes that the diagonal entry - has equal local row and column indices. That may not - necessarily hold, depending on the row and column Maps. See - note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(0, j) = X(0, j) / A_rc; + } // for each entry A_rc in the current column c + } // for each column c + + // Last iteration: c = 0. 
+ { + const lno_t c = 0; + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const scalar_t A_rc = val(k); + const lno_t r = ind(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current column c } } -} -template -void lowerTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + static void upperTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + manual_copy(X, Y); + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + // If lno_t is unsigned and numCols is 0, the loop + // below will have entirely the wrong number of iterations. 
+ if (numCols == 0) { + return; } - } - for (local_ordinal_type c = 0; c < numCols; ++c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = val(k); + // Don't use c >= 0 as the test, because that fails if + // lno_t is unsigned. We do c == 0 (last + // iteration) below. + for (lno_t c = numCols - 1; c != 0; --c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = end - 1; k >= beg; --k) { + const lno_t r = ind(k); + const auto A_rc = val(k); + /*(vqd 20 Jul 2020) This assumes that the diagonal entry + has equal local row and column indices. That may not + necessarily hold, depending on the row and column Maps. See + note above.*/ + for (lno_t j = 0; j < numVecs; ++j) { + if (r == c) { + X(c, j) = X(c, j) / A_rc; + } else { + X(r, j) -= A_rc * X(c, j); + } + } + } // for each entry A_rc in the current column c + } // for each column c + + // Last iteration: c = 0. + { + const offset_type beg = ptr(0); + const auto A_rc = val(beg); /*(vqd 20 Jul 2020) This assumes that the diagonal entry has equal local row and column indices. That may not necessarily hold, depending on the row and column Maps. 
See note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - if (r == c) { - X(c, j) = X(c, j) / A_rc; - } else { - X(r, j) -= A_rc * X(c, j); - } + for (lno_t j = 0; j < numVecs; ++j) { + X(0, j) = X(0, j) / A_rc; } - } // for each entry A_rc in the current column c - } // for each column c -} - -template -void lowerTriSolveCscUnitDiagConj(RangeMultiVectorType X, - const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); } } - for (local_ordinal_type c = 0; c < numCols; ++c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); - } - } // for each entry A_rc in the current column c - } // for each column c -} + static void lowerTriSolveCscUnitDiag(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = 
A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + manual_copy(X, Y); + + for (lno_t c = 0; c < numCols; ++c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = val(k); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current column c + } // for each column c + } -template -void lowerTriSolveCscConj(RangeMultiVectorType X, const CrsMatrixType& A, - DomainMultiVectorType Y) { - typedef - typename CrsMatrixType::row_map_type::non_const_value_type offset_type; - typedef typename CrsMatrixType::index_type::non_const_value_type - local_ordinal_type; - typedef typename CrsMatrixType::values_type::non_const_value_type - matrix_scalar_type; - typedef Kokkos::ArithTraits STS; - - const local_ordinal_type numRows = A.numRows(); - const local_ordinal_type numCols = A.numCols(); - const local_ordinal_type numVecs = X.extent(1); - typename CrsMatrixType::row_map_type ptr = A.graph.row_map; - typename CrsMatrixType::index_type ind = A.graph.entries; - typename CrsMatrixType::values_type val = A.values; - - for (local_ordinal_type j = 0; j < numVecs; ++j) { - for (local_ordinal_type i = 0; i < numRows; ++i) { - X(i, j) = Y(i, j); + static void upperTriSolveCscUnitDiagConj(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + 
KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + manual_copy(X, Y); + + // If lno_t is unsigned and numCols is 0, the loop + // below will have entirely the wrong number of iterations. + if (numCols == 0) { + return; + } + + // Don't use c >= 0 as the test, because that fails if + // lno_t is unsigned. We do c == 0 (last + // iteration) below. + for (lno_t c = numCols - 1; c != 0; --c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current column c + } // for each column c + + // Last iteration: c = 0. + { + const lno_t c = 0; + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + for (lno_t j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } + } // for each entry A_rc in the current column c } } - for (local_ordinal_type c = 0; c < numCols; ++c) { - const offset_type beg = ptr(c); - const offset_type end = ptr(c + 1); - for (offset_type k = beg; k < end; ++k) { - const local_ordinal_type r = ind(k); - const matrix_scalar_type A_rc = STS::conj(val(k)); + static void upperTriSolveCscConj(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + manual_copy(X, Y); + + // If 
lno_t is unsigned and numCols is 0, the loop + // below will have entirely the wrong number of iterations. + if (numCols == 0) { + return; + } + + // Don't use c >= 0 as the test, because that fails if + // lno_t is unsigned. We do c == 0 (last + // iteration) below. + for (lno_t c = numCols - 1; c != 0; --c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = end - 1; k >= beg; --k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + /*(vqd 20 Jul 2020) This assumes that the diagonal entry + has equal local row and column indices. That may not + necessarily hold, depending on the row and column Maps. See + note above.*/ + for (lno_t j = 0; j < numVecs; ++j) { + if (r == c) { + X(c, j) = X(c, j) / A_rc; + } else { + X(r, j) -= A_rc * X(c, j); + } + } + } // for each entry A_rc in the current column c + } // for each column c + + // Last iteration: c = 0. + { + const offset_type beg = ptr(0); + const scalar_t A_rc = STS::conj(val(beg)); /*(vqd 20 Jul 2020) This assumes that the diagonal entry has equal local row and column indices. That may not necessarily hold, depending on the row and column Maps. 
See note above.*/ - for (local_ordinal_type j = 0; j < numVecs; ++j) { - if (r == c) { - X(c, j) = X(c, j) / A_rc; - } else { + for (lno_t j = 0; j < numVecs; ++j) { + X(0, j) = X(0, j) / A_rc; + } + } + } + + static void lowerTriSolveCsc(RangeMultiVectorType X, const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + manual_copy(X, Y); + + for (lno_t c = 0; c < numCols; ++c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = val(k); + /*(vqd 20 Jul 2020) This assumes that the diagonal entry + has equal local row and column indices. That may not + necessarily hold, depending on the row and column Maps. 
See + note above.*/ + for (lno_t j = 0; j < numVecs; ++j) { + if (r == c) { + X(c, j) = X(c, j) / A_rc; + } else { + X(r, j) -= A_rc * X(c, j); + } + } + } // for each entry A_rc in the current column c + } // for each column c + } + + static void lowerTriSolveCscUnitDiagConj(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + manual_copy(X, Y); + + for (lno_t c = 0; c < numCols; ++c) { + const offset_type beg = ptr(c); + const offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + for (lno_t j = 0; j < numVecs; ++j) { X(r, j) -= A_rc * X(c, j); } - } - } // for each entry A_rc in the current column c - } // for each column c -} + } // for each entry A_rc in the current column c + } // for each column c + } + + static void lowerTriSolveCscConj(RangeMultiVectorType X, + const CrsMatrixType& A, + DomainMultiVectorType Y) { + const lno_t numRows = A.numRows(); + const lno_t numCols = A.numCols(); + const lno_t numPointRows = A.numPointRows(); + const lno_t block_size = numPointRows / numRows; + const lno_t numVecs = X.extent(1); + typename CrsMatrixType::row_map_type ptr = A.graph.row_map; + typename CrsMatrixType::index_type ind = A.graph.entries; + typename CrsMatrixType::values_type val = A.values; + + KK_REQUIRE_MSG(block_size == 1, "BSRs not support for this function yet"); + + manual_copy(X, Y); + + for (lno_t c = 0; c < numCols; ++c) { + const offset_type beg = ptr(c); + const 
offset_type end = ptr(c + 1); + for (offset_type k = beg; k < end; ++k) { + const lno_t r = ind(k); + const scalar_t A_rc = STS::conj(val(k)); + /*(vqd 20 Jul 2020) This assumes that the diagonal entry + has equal local row and column indices. That may not + necessarily hold, depending on the row and column Maps. See + note above.*/ + for (lno_t j = 0; j < numVecs; ++j) { + if (r == c) { + X(c, j) = X(c, j) / A_rc; + } else { + X(r, j) -= A_rc * X(c, j); + } + } + } // for each entry A_rc in the current column c + } // for each column c + } +}; } // namespace Sequential } // namespace Impl } // namespace KokkosSparse -#endif // KOKKOSSPARSE_IMPL_TRSM_HPP +#endif // KOKKOSSPARSE_TRSV_IMPL_HPP_ diff --git a/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_spec.hpp b/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_spec.hpp index 2e838337d226..a74f4ffe64b5 100644 --- a/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_spec.hpp +++ b/packages/kokkos-kernels/sparse/impl/KokkosSparse_trsv_spec.hpp @@ -20,6 +20,7 @@ #include #include #include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_BsrMatrix.hpp" // Include the actual functors #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY @@ -55,6 +56,22 @@ struct trsv_eti_spec_avail { Kokkos::Device, \ Kokkos::MemoryTraits > > { \ enum : bool { value = true }; \ + }; \ + \ + template <> \ + struct trsv_eti_spec_avail< \ + KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + const SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -93,50 +110,52 @@ struct TRSV; if (trans[0] == 'N' || trans[0] == 'n') { // no transpose if (uplo[0] == 'L' || uplo[0] == 'l') { // lower triangular if (diag[0] == 'U' || 
diag[0] == 'u') { // unit diagonal - Sequential::lowerTriSolveCsrUnitDiag(X, A, B); + Wrap::lowerTriSolveCsrUnitDiag(X, A, B); } else { // non unit diagonal - Sequential::lowerTriSolveCsr(X, A, B); + Wrap::lowerTriSolveCsr(X, A, B); } } else { // upper triangular if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::upperTriSolveCsrUnitDiag(X, A, B); + Wrap::upperTriSolveCsrUnitDiag(X, A, B); } else { // non unit diagonal - Sequential::upperTriSolveCsr(X, A, B); + Wrap::upperTriSolveCsr(X, A, B); } } } else if (trans[0] == 'T' || trans[0] == 't') { // transpose if (uplo[0] == 'L' || uplo[0] == 'l') { // lower triangular // Transposed lower tri CSR => upper tri CSC. if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::upperTriSolveCscUnitDiag(X, A, B); + Wrap::upperTriSolveCscUnitDiag(X, A, B); } else { // non unit diagonal - Sequential::upperTriSolveCsc(X, A, B); + Wrap::upperTriSolveCsc(X, A, B); } } else { // upper triangular // Transposed upper tri CSR => lower tri CSC. if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::lowerTriSolveCscUnitDiag(X, A, B); + Wrap::lowerTriSolveCscUnitDiag(X, A, B); } else { // non unit diagonal - Sequential::lowerTriSolveCsc(X, A, B); + Wrap::lowerTriSolveCsc(X, A, B); } } } else if (trans[0] == 'C' || trans[0] == 'c') { // conj transpose if (uplo[0] == 'L' || uplo[0] == 'l') { // lower triangular // Transposed lower tri CSR => upper tri CSC. if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::upperTriSolveCscUnitDiagConj(X, A, B); + Wrap::upperTriSolveCscUnitDiagConj(X, A, B); } else { // non unit diagonal - Sequential::upperTriSolveCscConj(X, A, B); + Wrap::upperTriSolveCscConj(X, A, B); } } else { // upper triangular // Transposed upper tri CSR => lower tri CSC. 
if (diag[0] == 'U' || diag[0] == 'u') { // unit diagonal - Sequential::lowerTriSolveCscUnitDiagConj(X, A, B); + Wrap::lowerTriSolveCscUnitDiagConj(X, A, B); } else { // non unit diagonal - Sequential::lowerTriSolveCscConj(X, A, B); + Wrap::lowerTriSolveCscConj(X, A, B); } } } @@ -169,6 +188,20 @@ struct TRSV, \ Kokkos::MemoryTraits >, \ + false, true>; \ + \ + extern template struct TRSV< \ + KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + const SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; #define KOKKOSSPARSE_TRSV_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ @@ -186,6 +219,20 @@ struct TRSV, \ Kokkos::MemoryTraits >, \ + false, true>; \ + \ + template struct TRSV< \ + KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE>, \ + Kokkos::View< \ + const SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ false, true>; #include diff --git a/packages/kokkos-kernels/sparse/src/KokkosKernels_Handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosKernels_Handle.hpp index d500f19d4885..680045823e6f 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosKernels_Handle.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosKernels_Handle.hpp @@ -605,18 +605,18 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object - * + * * @param handle_exec_space The execution space instance to execute kernels on. * @param num_streams The number of streams to allocate memory for. * @param gs_algorithm Specifies which algorithm to use: - * + * * KokkosSpace::GS_DEFAULT PointGaussSeidel * KokkosSpace::GS_PERMUTED ?? * KokkosSpace::GS_TEAM ?? * KokkosSpace::GS_CLUSTER ?? 
* KokkosSpace::GS_TWOSTAGE ?? * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * + * * KokkosGraph::COLORING_DEFAULT ?? * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring @@ -649,9 +649,9 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object - * + * * @param gs_algorithm Specifies which algorithm to use: - * + * * KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type. * KokkosSpace::GS_PERMUTED Reorders rows/cols into colors to improve locality. Uses RangePolicy over rows. * KokkosSpace::GS_TEAM Uses TeamPolicy over batches of rows with ThreadVector within rows. @@ -660,7 +660,7 @@ class KokkosKernelsHandle { * KokkosSpace::GS_TWOSTAGE Uses spmv to parallelize inner sweeps of x. * For more information, see: https://arxiv.org/pdf/2104.01196.pdf. * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * + * * KokkosGraph::COLORING_DEFAULT Depends on execution space: * COLORING_SERIAL on Kokkos::Serial; * COLORING_EB on GPUs; @@ -744,16 +744,16 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gs handle object - * + * * @param clusterAlgo Specifies which clustering algorithm to use: - * - * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? - * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? - * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? - * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? + * + * KokkosSparse::CLUSTER_DEFAULT ?? + * KokkosSparse::CLUSTER_MIS2 ?? + * KokkosSparse::CLUSTER_BALLOON ?? + * KokkosSparse::NUM_CLUSTERING_ALGORITHMS ?? * @param hint_verts_per_cluster Hint how many verticies to use per cluster * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * + * * KokkosGraph::COLORING_DEFAULT ?? 
* KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring @@ -821,10 +821,11 @@ class KokkosKernelsHandle { // ---------------------------------------- // SPADDHandleType *get_spadd_handle() { return this->spaddHandle; } - void create_spadd_handle(bool input_sorted) { + void create_spadd_handle(bool input_sorted = false, + bool input_merged = false) { this->destroy_spadd_handle(); this->is_owner_of_the_spadd_handle = true; - this->spaddHandle = new SPADDHandleType(input_sorted); + this->spaddHandle = new SPADDHandleType(input_sorted, input_merged); } void destroy_spadd_handle() { if (is_owner_of_the_spadd_handle && this->spaddHandle != NULL) { @@ -947,11 +948,13 @@ class KokkosKernelsHandle { SPILUKHandleType *get_spiluk_handle() { return this->spilukHandle; } void create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm algm, - size_type nrows, size_type nnzL, size_type nnzU) { + size_type nrows, size_type nnzL, size_type nnzU, + size_type block_size = 0) { this->destroy_spiluk_handle(); this->is_owner_of_the_spiluk_handle = true; - this->spilukHandle = new SPILUKHandleType(algm, nrows, nnzL, nnzU); - this->spilukHandle->reset_handle(nrows, nnzL, nnzU); + this->spilukHandle = + new SPILUKHandleType(algm, nrows, nnzL, nnzU, block_size); + this->spilukHandle->reset_handle(nrows, nnzL, nnzU, block_size); this->spilukHandle->set_team_size(this->team_work_size); this->spilukHandle->set_vector_size(this->vector_size); } diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_BsrMatrix.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_BsrMatrix.hpp index e0d6e61a3bd4..db9ef717531a 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -1108,6 +1108,10 @@ template struct is_bsr_matrix> : public std::true_type {}; template struct is_bsr_matrix> : public std::true_type {}; + +/// \brief Equivalent to is_bsr_matrix::value. 
+template +inline constexpr bool is_bsr_matrix_v = is_bsr_matrix::value; //---------------------------------------------------------------------------- } // namespace Experimental diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_CrsMatrix.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_CrsMatrix.hpp index 7070172a1f3b..ce9ec99e4e5b 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_CrsMatrix.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_CrsMatrix.hpp @@ -867,5 +867,9 @@ struct is_crs_matrix> : public std::true_type {}; template struct is_crs_matrix> : public std::true_type {}; +/// \brief Equivalent to is_crs_matrix::value. +template +inline constexpr bool is_crs_matrix_v = is_crs_matrix::value; + } // namespace KokkosSparse #endif diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_LUPrec.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_LUPrec.hpp index a257b8f09c5a..d687c8dd4fb8 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_LUPrec.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_LUPrec.hpp @@ -24,6 +24,7 @@ #include #include #include +#include namespace KokkosSparse { namespace Experimental { @@ -45,8 +46,9 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { using ScalarType = typename std::remove_const::type; using EXSP = typename CRS::execution_space; using MEMSP = typename CRS::memory_space; + using DEVICE = typename Kokkos::Device; using karith = typename Kokkos::ArithTraits; - using View1d = typename Kokkos::View; + using View1d = typename Kokkos::View; private: // trsm takes host views @@ -61,11 +63,11 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { LUPrec(const CRSArg &L, const CRSArg &U) : _L(L), _U(U), - _tmp("LUPrec::_tmp", L.numRows()), - _tmp2("LUPrec::_tmp", L.numRows()), + _tmp("LUPrec::_tmp", L.numPointRows()), + _tmp2("LUPrec::_tmp", L.numPointRows()), _khL(), _khU() { - KK_REQUIRE_MSG(L.numRows() == U.numRows(), + 
KK_REQUIRE_MSG(L.numPointRows() == U.numPointRows(), "LUPrec: L.numRows() != U.numRows()"); _khL.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, L.numRows(), @@ -80,22 +82,13 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { _khU.destroy_sptrsv_handle(); } - ///// \brief Apply the preconditioner to X, putting the result in Y. - ///// - ///// \tparam XViewType Input vector, as a 1-D Kokkos::View - ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View - ///// - ///// \param transM [in] Not used. - ///// \param alpha [in] Not used - ///// \param beta [in] Not used. - ///// - ///// It takes L and U and the stores U^inv L^inv X in Y - // - virtual void apply( - const Kokkos::View> &X, - const Kokkos::View> &Y, - const char transM[] = "N", ScalarType alpha = karith::one(), - ScalarType beta = karith::zero()) const { + template < + typename Matrix, + typename std::enable_if::value>::type * = nullptr> + void apply_impl(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { // tmp = trsv(L, x); //Apply L^inv to x // y = trsv(U, tmp); //Apply U^inv to tmp @@ -111,6 +104,62 @@ class LUPrec : public KokkosSparse::Experimental::Preconditioner { KokkosBlas::axpby(alpha, _tmp2, beta, Y); } + + template < + typename Matrix, + typename std::enable_if::value>::type * = nullptr> + void apply_impl(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + // tmp = trsv(L, x); //Apply L^inv to x + // y = trsv(U, tmp); //Apply U^inv to tmp + + KK_REQUIRE_MSG(transM[0] == NoTranspose[0], + "LUPrec::apply only supports 'N' for transM"); + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + using Layout = Kokkos::LayoutLeft; +#else + using Layout = Kokkos::LayoutRight; +#endif + + // trsv is implemented for MV so we need to convert our views + using UView2d = 
typename Kokkos::View< + ScalarType **, Layout, DEVICE, + Kokkos::MemoryTraits >; + using UView2dc = typename Kokkos::View< + const ScalarType **, Layout, DEVICE, + Kokkos::MemoryTraits >; + UView2dc X2d(X.data(), X.extent(0), 1); + UView2d Y2d(Y.data(), Y.extent(0), 1), + tmp2d(_tmp.data(), _tmp.extent(0), 1), + tmp22d(_tmp2.data(), _tmp2.extent(0), 1); + + KokkosSparse::trsv("L", "N", "N", _L, X2d, tmp2d); + KokkosSparse::trsv("U", "N", "N", _U, tmp2d, tmp22d); + + KokkosBlas::axpby(alpha, _tmp2, beta, Y); + } + + ///// \brief Apply the preconditioner to X, putting the result in Y. + ///// + ///// \tparam XViewType Input vector, as a 1-D Kokkos::View + ///// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View + ///// + ///// \param transM [in] Not used. + ///// \param alpha [in] Not used + ///// \param beta [in] Not used. + ///// + ///// It takes L and U and the stores U^inv L^inv X in Y + // + virtual void apply(const Kokkos::View &X, + const Kokkos::View &Y, + const char transM[] = "N", + ScalarType alpha = karith::one(), + ScalarType beta = karith::zero()) const { + apply_impl(X, Y, transM, alpha, beta); + } //@} //! Set this preconditioner's parameters. 
diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils_mkl.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils_mkl.hpp index 7a8dd0cb22cb..a14e19f3cf65 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils_mkl.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils_mkl.hpp @@ -62,9 +62,15 @@ inline void mkl_internal_safe_call(sparse_status_t mkl_status, const char *name, } } +} // namespace Impl +} // namespace KokkosSparse + #define KOKKOSKERNELS_MKL_SAFE_CALL(call) \ KokkosSparse::Impl::mkl_internal_safe_call(call, #call, __FILE__, __LINE__) +namespace KokkosSparse { +namespace Impl { + inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { switch (toupper(mode_kk)) { case 'N': return SPARSE_OPERATION_NON_TRANSPOSE; @@ -88,11 +94,58 @@ struct mkl_is_supported_value_type> : std::true_type {}; template <> struct mkl_is_supported_value_type> : std::true_type {}; +// Helper to: +// - define the MKL type equivalent to a given Kokkos scalar type +// - provide an easy implicit conversion to that MKL type +template +struct KokkosToMKLScalar { + static_assert(mkl_is_supported_value_type::value, + "Scalar type not supported by MKL"); + using type = Scalar; + KokkosToMKLScalar(Scalar val_) : val(val_) {} + operator Scalar() const { return val; } + Scalar val; +}; + +template <> +struct KokkosToMKLScalar> { + using type = MKL_Complex8; + KokkosToMKLScalar(Kokkos::complex val_) : val(val_) {} + operator MKL_Complex8() const { return {val.real(), val.imag()}; } + Kokkos::complex val; +}; + +template <> +struct KokkosToMKLScalar> { + using type = MKL_Complex16; + KokkosToMKLScalar(Kokkos::complex val_) : val(val_) {} + operator MKL_Complex16() const { return {val.real(), val.imag()}; } + Kokkos::complex val; +}; + +template +struct KokkosToOneMKLScalar { + // Note: we happen to use the same set of types in classic MKL and OneMKL. + // If that changes, update this logic. 
+ static_assert(mkl_is_supported_value_type::value, + "Scalar type not supported by OneMKL"); + using type = Scalar; +}; + +template +struct KokkosToOneMKLScalar> { + static_assert(mkl_is_supported_value_type>::value, + "Scalar type not supported by OneMKL"); + using type = std::complex; +}; + // MKLSparseMatrix provides thin wrapper around MKL matrix handle // (sparse_matrix_t) and encapsulates MKL call dispatches related to details // like value_type, allowing simple client code in kernels. template class MKLSparseMatrix { + static_assert(mkl_is_supported_value_type::value, + "Provided value_type type not supported by MKL"); sparse_matrix_t mtx; public: @@ -100,11 +153,7 @@ class MKLSparseMatrix { // Constructs MKL sparse matrix from KK sparse views (m rows x n cols) inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols, - MKL_INT *xadj, MKL_INT *adj, value_type *values) { - throw std::runtime_error( - "Scalar type used in MKLSparseMatrix is NOT " - "supported by MKL"); - } + MKL_INT *xadj, MKL_INT *adj, value_type *values) {} // Allows using MKLSparseMatrix directly in MKL calls inline operator sparse_matrix_t() const { return mtx; } @@ -112,11 +161,7 @@ class MKLSparseMatrix { // Exports MKL sparse matrix contents into KK views inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols, MKL_INT *&rows_start, MKL_INT *&columns, - value_type *&values) { - throw std::runtime_error( - "Scalar type used in MKLSparseMatrix is NOT " - "supported by MKL"); - } + value_type *&values) {} inline void destroy() { KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx)); diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_coo2crs.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_coo2crs.hpp index 45e54ce4748b..a29d818cb1ba 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_coo2crs.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_coo2crs.hpp @@ -16,11 +16,6 @@ #ifndef _KOKKOSSPARSE_COO2CRS_HPP #define _KOKKOSSPARSE_COO2CRS_HPP 
-// The unorderedmap changes necessary for this to work -// have not made it into Kokkos 4.0.00 pr 4.0.01 will -// need to see if it happens in 4.1.00 to have a final -// version check here. -#if KOKKOS_VERSION >= 40099 || defined(DOXY) #include "KokkosSparse_CooMatrix.hpp" #include "KokkosSparse_CrsMatrix.hpp" @@ -99,5 +94,4 @@ auto coo2crs(KokkosSparse::CooMatrix= 40099 || defined(DOXY) #endif // _KOKKOSSPARSE_COO2CRS_HPP diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 649229918d73..624382ec5b3d 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -29,13 +29,22 @@ namespace KokkosSparse { enum GSAlgorithm { GS_DEFAULT, GS_PERMUTED, GS_TEAM, GS_CLUSTER, GS_TWOSTAGE }; enum GSDirection { GS_FORWARD, GS_BACKWARD, GS_SYMMETRIC }; -enum ClusteringAlgorithm { +enum struct ClusteringAlgorithm { CLUSTER_DEFAULT, CLUSTER_MIS2, CLUSTER_BALLOON, NUM_CLUSTERING_ALGORITHMS }; +static constexpr ClusteringAlgorithm CLUSTER_DEFAULT = + ClusteringAlgorithm::CLUSTER_DEFAULT; +static constexpr ClusteringAlgorithm CLUSTER_MIS2 = + ClusteringAlgorithm::CLUSTER_MIS2; +static constexpr ClusteringAlgorithm CLUSTER_BALLOON = + ClusteringAlgorithm::CLUSTER_BALLOON; +static constexpr ClusteringAlgorithm NUM_CLUSTERING_ALGORITHMS = + ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS; + inline const char *getClusterAlgoName(ClusteringAlgorithm ca) { switch (ca) { case CLUSTER_BALLOON: return "Balloon"; diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_gmres.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_gmres.hpp index 31b736c3937d..b0b708a33046 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_gmres.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_gmres.hpp @@ -89,8 +89,9 @@ void gmres(KernelHandle* handle, AMatrix& A, BType& B, 
XType& X, "gmres: A size type must match KernelHandle entry " "type (aka size_type, and const doesn't matter)"); - static_assert(KokkosSparse::is_crs_matrix::value, - "gmres: A is not a CRS matrix."); + static_assert(KokkosSparse::is_crs_matrix::value || + KokkosSparse::Experimental::is_bsr_matrix::value, + "gmres: A is not a CRS or BSR matrix."); static_assert(Kokkos::is_view::value, "gmres: B is not a Kokkos::View."); static_assert(Kokkos::is_view::value, @@ -120,8 +121,10 @@ void gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, using c_persist_t = typename KernelHandle::HandlePersistentMemorySpace; if ((X.extent(0) != B.extent(0)) || - (static_cast(A.numCols()) != static_cast(X.extent(0))) || - (static_cast(A.numRows()) != static_cast(B.extent(0)))) { + (static_cast(A.numPointCols()) != + static_cast(X.extent(0))) || + (static_cast(A.numPointRows()) != + static_cast(B.extent(0)))) { std::ostringstream os; os << "KokkosSparse::gmres: Dimensions do not match: " << ", A: " << A.numRows() << " x " << A.numCols() @@ -135,11 +138,20 @@ void gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, const_handle_type tmp_handle(*handle); - using AMatrix_Internal = KokkosSparse::CrsMatrix< + using AMatrix_Bsr_Internal = KokkosSparse::Experimental::BsrMatrix< typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, typename AMatrix::device_type, Kokkos::MemoryTraits, typename AMatrix::const_size_type>; + using AMatrix_Internal = std::conditional_t< + KokkosSparse::is_crs_matrix::value, + KokkosSparse::CrsMatrix, + typename AMatrix::const_size_type>, + AMatrix_Bsr_Internal>; + using B_Internal = Kokkos::View< typename BType::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, @@ -154,9 +166,9 @@ void gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, using Precond_Internal = Preconditioner; - AMatrix_Internal A_i = A; - B_Internal b_i = B; - X_Internal x_i = X; + AMatrix_Internal A_i(A); + B_Internal b_i = 
B; + X_Internal x_i = X; Precond_Internal* precond_i = reinterpret_cast(precond); diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd.hpp index 74efed66bc85..127400c752c5 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd.hpp @@ -19,25 +19,27 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_helpers.hpp" -#include "KokkosSparse_spadd_symbolic_spec.hpp" +#include "KokkosBlas1_scal.hpp" #include "KokkosSparse_spadd_numeric_spec.hpp" +#include "KokkosSparse_spadd_symbolic_spec.hpp" namespace KokkosSparse { namespace Experimental { // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. -template void spadd_symbolic( - KernelHandle* handle, const alno_row_view_t_ a_rowmap, + const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, // same type as column indices + typename KernelHandle::const_nnz_lno_t n, const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, const blno_row_view_t_ b_rowmap, const blno_nnz_view_t_ b_entries, clno_row_view_t_ c_rowmap) // c_rowmap must already be allocated (doesn't // need to be initialized) { - typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; typedef typename Kokkos::Device DeviceType; @@ -51,49 +53,75 @@ void spadd_symbolic( ConstKernelHandle; ConstKernelHandle tmp_handle(*handle); - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_entries; - typedef Kokkos::View::array_layout, - DeviceType, 
Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_rowmap; - KokkosSparse::Impl::SPADD_SYMBOLIC:: - spadd_symbolic(&tmp_handle, - Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), - Internal_a_entries(a_entries.data(), a_entries.extent(0)), - Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), - Internal_b_entries(b_entries.data(), b_entries.extent(0)), - Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); + + auto addHandle = handle->get_spadd_handle(); + bool useFallback = !addHandle->is_input_strict_crs(); + if (useFallback) { + KokkosSparse::Impl::SPADD_SYMBOLIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_b_rowmap, Internal_b_entries, Internal_c_rowmap, false>:: + spadd_symbolic( + exec, &tmp_handle, m, n, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); + } else { + KokkosSparse::Impl::SPADD_SYMBOLIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_b_rowmap, Internal_b_entries, Internal_c_rowmap>:: + spadd_symbolic( + exec, &tmp_handle, m, n, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); + } } -template +void spadd_symbolic(KernelHandle *handle, Args... 
args) { + spadd_symbolic(typename KernelHandle::HandleExecSpace{}, handle, args...); +} + +template -void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, +void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, + const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, const ascalar_nnz_view_t_ a_values, const ascalar_t_ alpha, const blno_row_view_t_ b_rowmap, @@ -101,7 +129,6 @@ void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, const bscalar_nnz_view_t_ b_values, const bscalar_t_ beta, const clno_row_view_t_ c_rowmap, clno_nnz_view_t_ c_entries, cscalar_nnz_view_t_ c_values) { - typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; typedef typename Kokkos::Device DeviceType; @@ -113,116 +140,183 @@ void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> ConstKernelHandle; - ConstKernelHandle tmp_handle(*handle); + ConstKernelHandle tmp_handle(*handle); // handle->exec_space is also copied - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_values; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> 
Internal_b_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_values; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_values; - KokkosSparse::Impl::SPADD_NUMERIC:: - spadd_numeric(&tmp_handle, alpha, - Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), - Internal_a_entries(a_entries.data(), a_entries.extent(0)), - Internal_a_values(a_values.data(), a_values.extent(0)), - beta, - Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), - Internal_b_entries(b_entries.data(), b_entries.extent(0)), - Internal_b_values(b_values.data(), b_values.extent(0)), - Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), - Internal_c_entries(c_entries.data(), c_entries.extent(0)), - Internal_c_values(c_values.data(), c_values.extent(0))); + + auto addHandle = handle->get_spadd_handle(); + bool useFallback = !addHandle->is_input_strict_crs(); + if (useFallback) { + KokkosSparse::Impl::SPADD_NUMERIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_a_values, Internal_b_rowmap, Internal_b_entries, + Internal_b_values, Internal_c_rowmap, Internal_c_entries, + Internal_c_values, false>:: + spadd_numeric(exec, &tmp_handle, m, n, alpha, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_a_values(a_values.data(), a_values.extent(0)), + beta, + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_b_values(b_values.data(), b_values.extent(0)), + Internal_c_rowmap(c_rowmap.data(), 
c_rowmap.extent(0)), + Internal_c_entries(c_entries.data(), c_entries.extent(0)), + Internal_c_values(c_values.data(), c_values.extent(0))); + } else { + KokkosSparse::Impl::SPADD_NUMERIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_a_values, Internal_b_rowmap, Internal_b_entries, + Internal_b_values, Internal_c_rowmap, Internal_c_entries, + Internal_c_values>:: + spadd_numeric(exec, &tmp_handle, m, n, alpha, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_a_values(a_values.data(), a_values.extent(0)), + beta, + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_b_values(b_values.data(), b_values.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), + Internal_c_entries(c_entries.data(), c_entries.extent(0)), + Internal_c_values(c_values.data(), c_values.extent(0))); + } +} + +// one without an execution space arg +template +void spadd_numeric(KernelHandle *handle, Args... args) { + spadd_numeric(typename KernelHandle::HandleExecSpace{}, handle, args...); } } // namespace Experimental // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. 
-template -void spadd_symbolic(KernelHandle* handle, const AMatrix& A, const BMatrix& B, - CMatrix& C) { +template +void spadd_symbolic(const ExecSpace &exec, KernelHandle *handle, + const AMatrix &A, const BMatrix &B, CMatrix &C) { using row_map_type = typename CMatrix::row_map_type::non_const_type; using entries_type = typename CMatrix::index_type::non_const_type; using values_type = typename CMatrix::values_type::non_const_type; + auto addHandle = handle->get_spadd_handle(); + // Create the row_map of C, no need to initialize it row_map_type row_mapC( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "row map"), + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "row map"), A.numRows() + 1); - KokkosSparse::Experimental::spadd_symbolic(handle, A.graph.row_map, - A.graph.entries, B.graph.row_map, - B.graph.entries, row_mapC); + + // Shortcuts for special cases as they cause errors in some TPL + // implementations (e.g., cusparse and hipsparse) + if (!A.nnz()) { + Kokkos::deep_copy(exec, row_mapC, B.graph.row_map); + addHandle->set_c_nnz(B.graph.entries.extent(0)); + } else if (!B.nnz()) { + Kokkos::deep_copy(exec, row_mapC, A.graph.row_map); + addHandle->set_c_nnz(A.graph.entries.extent(0)); + } else { + KokkosSparse::Experimental::spadd_symbolic( + exec, handle, A.numRows(), A.numCols(), A.graph.row_map, + A.graph.entries, B.graph.row_map, B.graph.entries, row_mapC); + } // Now create and allocate the entries and values // views so we can build a graph and then matrix C // and subsequently construct C. - auto addHandle = handle->get_spadd_handle(); entries_type entriesC( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries"), + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "entries"), addHandle->get_c_nnz()); // Finally since we already have the number of nnz handy // we can go ahead and allocate C's values and set them. 
- values_type valuesC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "values"), - addHandle->get_c_nnz()); + values_type valuesC( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "values"), + addHandle->get_c_nnz()); C = CMatrix("matrix", A.numRows(), A.numCols(), addHandle->get_c_nnz(), valuesC, row_mapC, entriesC); } -// Symbolic: count entries in each row in C to produce rowmap +// Numeric: fill the column indices and values // kernel handle has information about whether it is sorted add or not. +template +void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + const AScalar alpha, const AMatrix &A, const BScalar beta, + const BMatrix &B, CMatrix &C) { + if (!A.nnz()) { + Kokkos::deep_copy(exec, C.graph.entries, B.graph.entries); + KokkosBlas::scal(exec, C.values, beta, B.values); + } else if (!B.nnz()) { + Kokkos::deep_copy(exec, C.graph.entries, A.graph.entries); + KokkosBlas::scal(exec, C.values, alpha, A.values); + } else { + KokkosSparse::Experimental::spadd_numeric( + exec, handle, A.numRows(), A.numCols(), A.graph.row_map, + A.graph.entries, A.values, alpha, B.graph.row_map, B.graph.entries, + B.values, beta, C.graph.row_map, C.graph.entries, C.values); + } +} + +// One without an explicit execution space argument +template +void spadd_symbolic(KernelHandle *handle, const AMatrix &A, const BMatrix &B, + CMatrix &C) { + spadd_symbolic(typename AMatrix::execution_space{}, handle, A, B, C); +} + template -void spadd_numeric(KernelHandle* handle, const AScalar alpha, const AMatrix& A, - const BScalar beta, const BMatrix& B, CMatrix& C) { - KokkosSparse::Experimental::spadd_numeric( - handle, A.graph.row_map, A.graph.entries, A.values, alpha, - B.graph.row_map, B.graph.entries, B.values, beta, C.graph.row_map, - C.graph.entries, C.values); +void spadd_numeric(KernelHandle *handle, const AScalar alpha, const AMatrix &A, + const BScalar beta, const BMatrix &B, CMatrix &C) { + spadd_numeric(typename AMatrix::execution_space{}, handle, alpha, A, 
beta, B, + C); } } // namespace KokkosSparse diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd_handle.hpp index 2902550d6a90..760f912c6d3c 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd_handle.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spadd_handle.hpp @@ -32,8 +32,46 @@ class SPADDHandle { typedef typename lno_row_view_t_::non_const_value_type size_type; typedef ExecutionSpace execution_space; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + struct SpaddCusparseData { + size_t nbytes; + void* workspace; + cusparseMatDescr_t descrA, descrB, descrC; + + SpaddCusparseData() + : nbytes(0), + workspace(nullptr), + descrA(nullptr), + descrB(nullptr), + descrC(nullptr) {} + + ~SpaddCusparseData() { + Kokkos::kokkos_free(workspace); + cusparseDestroyMatDescr(descrA); + cusparseDestroyMatDescr(descrB); + cusparseDestroyMatDescr(descrC); + } + }; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + struct SpaddRocsparseData { + rocsparse_mat_descr descrA, descrB, descrC; + + SpaddRocsparseData() : descrA(nullptr), descrB(nullptr), descrC(nullptr) {} + + ~SpaddRocsparseData() { + rocsparse_destroy_mat_descr(descrA); + rocsparse_destroy_mat_descr(descrB); + rocsparse_destroy_mat_descr(descrC); + } + }; +#endif + private: - bool input_sorted; + // if both are true, the input matrices are strict CRS + bool input_sorted; // column indices in a row are sorted + bool input_merged; // column indices in a row are unique (i.e., merged) size_type result_nnz_size; @@ -76,11 +114,20 @@ class SPADDHandle { int get_sort_option() { return this->sort_option; } +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + SpaddCusparseData cusparseData; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + SpaddRocsparseData rocsparseData; +#endif + /** * \brief Default constructor. 
*/ - SPADDHandle(bool input_is_sorted) + SPADDHandle(bool input_is_sorted, bool input_is_merged = false) : input_sorted(input_is_sorted), + input_merged(input_is_merged), result_nnz_size(0), called_symbolic(false), called_numeric(false) {} @@ -95,6 +142,8 @@ class SPADDHandle { void set_call_numeric(bool call = true) { this->called_numeric = call; } bool is_input_sorted() { return input_sorted; } + bool is_input_merged() { return input_merged; } + bool is_input_strict_crs() { return input_sorted && input_merged; } }; } // namespace KokkosSparse diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spiluk.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spiluk.hpp index 1bf78abe5e54..b3644a8709f5 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spiluk.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spiluk.hpp @@ -530,7 +530,6 @@ void spiluk_numeric(KernelHandle* handle, A_entries_i, A_values_i, L_rowmap_i, L_entries_i, L_values_i, U_rowmap_i, U_entries_i, U_values_i); - } // spiluk_numeric template class SPILUKHandle { public: - typedef ExecutionSpace HandleExecSpace; - typedef TemporaryMemorySpace HandleTempMemorySpace; - typedef PersistentMemorySpace HandlePersistentMemorySpace; + using HandleExecSpace = ExecutionSpace; + using HandleTempMemorySpace = TemporaryMemorySpace; + using HandlePersistentMemorySpace = PersistentMemorySpace; - typedef ExecutionSpace execution_space; - typedef HandlePersistentMemorySpace memory_space; + using execution_space = ExecutionSpace; + using memory_space = HandlePersistentMemorySpace; - typedef typename std::remove_const::type size_type; - typedef const size_type const_size_type; + using TeamPolicy = Kokkos::TeamPolicy; + using RangePolicy = Kokkos::RangePolicy; - typedef typename std::remove_const::type nnz_lno_t; - typedef const nnz_lno_t const_nnz_lno_t; + using size_type = typename std::remove_const::type; + using const_size_type = const size_type; - typedef typename std::remove_const::type 
nnz_scalar_t; - typedef const nnz_scalar_t const_nnz_scalar_t; + using nnz_lno_t = typename std::remove_const::type; + using const_nnz_lno_t = const nnz_lno_t; - typedef typename Kokkos::View - nnz_row_view_t; + using nnz_scalar_t = typename std::remove_const::type; + using const_nnz_scalar_t = const nnz_scalar_t; - typedef typename Kokkos::View - nnz_lno_view_t; + using nnz_row_view_t = Kokkos::View; - typedef typename Kokkos::View - nnz_row_view_host_t; + using nnz_lno_view_t = Kokkos::View; - typedef typename Kokkos::View - nnz_lno_view_host_t; + using nnz_value_view_t = + typename Kokkos::View; - typedef typename std::make_signed< - typename nnz_row_view_t::non_const_value_type>::type signed_integral_t; - typedef Kokkos::View - signed_nnz_lno_view_t; + using nnz_row_view_host_t = + typename Kokkos::View; - typedef Kokkos::View - work_view_t; + using nnz_lno_view_host_t = + typename Kokkos::View; + + using signed_integral_t = typename std::make_signed< + typename nnz_row_view_t::non_const_value_type>::type; + using signed_nnz_lno_view_t = + Kokkos::View; + + using work_view_t = Kokkos::View; private: nnz_row_view_t level_list; // level IDs which the rows belong to @@ -95,6 +95,7 @@ class SPILUKHandle { size_type nlevels; size_type nnzL; size_type nnzU; + size_type block_size; size_type level_maxrows; // max. 
number of rows among levels size_type level_maxrowsperchunk; // max.number of rows among chunks among levels @@ -109,7 +110,7 @@ class SPILUKHandle { public: SPILUKHandle(SPILUKAlgorithm choice, const size_type nrows_, const size_type nnzL_, const size_type nnzU_, - bool symbolic_complete_ = false) + const size_type block_size_ = 0, bool symbolic_complete_ = false) : level_list(), level_idx(), level_ptr(), @@ -121,6 +122,7 @@ class SPILUKHandle { nlevels(0), nnzL(nnzL_), nnzU(nnzU_), + block_size(block_size_), level_maxrows(0), level_maxrowsperchunk(0), symbolic_complete(symbolic_complete_), @@ -128,21 +130,28 @@ class SPILUKHandle { team_size(-1), vector_size(-1) {} - void reset_handle(const size_type nrows_, const size_type nnzL_, - const size_type nnzU_) { + void reset_handle( + const size_type nrows_, const size_type nnzL_, const size_type nnzU_, + const size_type block_size_ = Kokkos::ArithTraits::max()) { set_nrows(nrows_); set_num_levels(0); set_nnzL(nnzL_); set_nnzU(nnzU_); + // user likely does not want to reset block size to 0, so set default + // to size_type::max + if (block_size_ != Kokkos::ArithTraits::max()) { + set_block_size(block_size_); + } set_level_maxrows(0); set_level_maxrowsperchunk(0); - level_list = nnz_row_view_t("level_list", nrows_), - level_idx = nnz_lno_view_t("level_idx", nrows_), - level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), - hlevel_ptr = nnz_lno_view_host_t("hlevel_ptr", nrows_ + 1), - level_nchunks = nnz_lno_view_host_t(), - level_nrowsperchunk = nnz_lno_view_host_t(), reset_symbolic_complete(), + level_list = nnz_row_view_t("level_list", nrows_); + level_idx = nnz_lno_view_t("level_idx", nrows_); + level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1); + hlevel_ptr = nnz_lno_view_host_t("hlevel_ptr", nrows_ + 1); + level_nchunks = nnz_lno_view_host_t(); + level_nrowsperchunk = nnz_lno_view_host_t(); iw = work_view_t(); + reset_symbolic_complete(); } virtual ~SPILUKHandle(){}; @@ -205,6 +214,14 @@ class SPILUKHandle { 
KOKKOS_INLINE_FUNCTION void set_nnzU(const size_type nnzU_) { this->nnzU = nnzU_; } + KOKKOS_INLINE_FUNCTION + size_type get_block_size() const { return block_size; } + + KOKKOS_INLINE_FUNCTION + void set_block_size(const size_type block_size_) { + this->block_size = block_size_; + } + KOKKOS_INLINE_FUNCTION size_type get_level_maxrows() const { return level_maxrows; } @@ -223,6 +240,8 @@ class SPILUKHandle { bool is_symbolic_complete() const { return symbolic_complete; } + bool is_block_enabled() const { return block_size > 0; } + size_type get_num_levels() const { return nlevels; } void set_num_levels(size_type nlevels_) { this->nlevels = nlevels_; } @@ -236,9 +255,6 @@ class SPILUKHandle { int get_vector_size() const { return this->vector_size; } void print_algorithm() { - if (algm == SPILUKAlgorithm::SEQLVLSCHD_RP) - std::cout << "SEQLVLSCHD_RP" << std::endl; - if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1) std::cout << "SEQLVLSCHD_TP1" << std::endl; @@ -249,19 +265,6 @@ class SPILUKHandle { } */ } - - inline SPILUKAlgorithm StringToSPILUKAlgorithm(std::string &name) { - if (name == "SPILUK_DEFAULT") - return SPILUKAlgorithm::SEQLVLSCHD_RP; - else if (name == "SPILUK_RANGEPOLICY") - return SPILUKAlgorithm::SEQLVLSCHD_RP; - else if (name == "SPILUK_TEAMPOLICY1") - return SPILUKAlgorithm::SEQLVLSCHD_TP1; - /*else if(name=="SPILUK_TEAMPOLICY2") return - * SPILUKAlgorithm::SEQLVLSCHED_TP2;*/ - else - throw std::runtime_error("Invalid SPILUKAlgorithm name"); - } }; } // namespace Experimental diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp index bd038813d1e3..23912916958c 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv.hpp @@ -22,7 +22,7 @@ #define KOKKOSSPARSE_SPMV_HPP_ #include "KokkosKernels_helpers.hpp" -#include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_spmv_handle.hpp" #include 
"KokkosSparse_spmv_spec.hpp" #include "KokkosSparse_spmv_struct_spec.hpp" #include "KokkosSparse_spmv_bsrmatrix_spec.hpp" @@ -40,816 +40,47 @@ struct RANK_ONE {}; struct RANK_TWO {}; } // namespace -/// \brief Kokkos sparse matrix-vector multiply on single -/// vectors (RANK_ONE tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is -/// controlled by mode (see below). -/// -/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access -/// the memory spaces of A, x, and y. -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-1 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-1 Kokkos::View and its rank must match that of XVector -/// -/// \param space [in] The execution space instance on which to run the -/// kernel. -/// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] A vector to multiply on the left by A. -/// \param beta [in] Scalar multiplier for the vector y. -/// \param y [in/out] Result vector. -/// \param tag RANK_ONE dispatch -#ifdef DOXY // documentation version - don't separately document SFINAE - // specializations for BSR and CRS -template -#else -template ::value>::type* = nullptr> -#endif -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, - [[maybe_unused]] const RANK_ONE& tag) { - - // Make sure that x and y are Views. 
- static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: XVector must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: YVector must be a Kokkos::View."); - // Make sure A, x, y are accessible to ExecutionSpace - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - -// Make sure that x and y have the same rank. -// Make sure that x (and therefore y) is rank 1. -#if (KOKKOS_VERSION >= 40100) - static_assert(XVector::rank() == YVector::rank(), - "KokkosSparse::spmv: Vector ranks do not match."); - - static_assert(XVector::rank() == 1, - "KokkosSparse::spmv: Both Vector inputs must have rank 1 " - "in order to call this specialization of spmv."); -#else - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); - static_assert(static_cast(XVector::rank) == 1, - "KokkosSparse::spmv: Both Vector inputs must have rank 1 " - "in order to call this specialization of spmv."); -#endif - // Make sure that y is non-const. - static_assert(std::is_same::value, - "KokkosSparse::spmv: Output Vector must be non-const."); - - // Check compatibility of dimensions at run time. 
- if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols()) > static_cast(x.extent(0))) || - (static_cast(A.numRows()) > static_cast(y.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv: Dimensions do not match: " - << ", A: " << A.numRows() << " x " << A.numCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols()) > static_cast(y.extent(0))) || - (static_cast(A.numRows()) > static_cast(x.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv: Dimensions do not match (transpose): " - << ", A: " << A.numRows() << " x " << A.numCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } - - typedef KokkosSparse::CrsMatrix< - typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - - typedef Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_Internal; - - AMatrix_Internal A_i = A; - XVector_Internal x_i = x; - YVector_Internal y_i = y; - - if (alpha == Kokkos::ArithTraits::zero() || A_i.numRows() == 0 || - A_i.numCols() == 0 || A_i.nnz() == 0) { - // This is required to maintain semantics of KokkosKernels native SpMV: - // if y contains NaN but beta = 0, the result y should be filled with 0. 
- // For example, this is useful for passing in uninitialized y and beta=0. - if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(space, y_i, beta, y_i); - return; - } - - // Whether to call KokkosKernel's native implementation, even if a TPL impl is - // available - bool useFallback = controls.isParameter("algorithm") && - (controls.getParameter("algorithm") != "tpl"); - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE does not support the conjugate mode (C) - if constexpr (std::is_same_v || - std::is_same_v) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } - // cuSPARSE 12 requires that the output (y) vector is 16-byte aligned for all - // scalar types -#if defined(CUSPARSE_VER_MAJOR) && (CUSPARSE_VER_MAJOR == 12) - uintptr_t yptr = uintptr_t((void*)y.data()); - if (yptr % 16 != 0) useFallback = true; -#endif -#endif - -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE - if (std::is_same::value) { - useFallback = useFallback || (mode[0] != NoTranspose[0]); - } -#endif - -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same_v) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } -#ifdef KOKKOS_ENABLE_SYCL - if (std::is_same_v) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } -#endif -#endif - - if (useFallback) { - // Explicitly call the non-TPL SPMV implementation - std::string label = - "KokkosSparse::spmv[NATIVE," + - Kokkos::ArithTraits< - typename AMatrix_Internal::non_const_value_type>::name() + - "]"; - Kokkos::Profiling::pushRegion(label); - Impl::SPMV::spmv(space, controls, mode, alpha, A_i, - x_i, beta, y_i); - Kokkos::Profiling::popRegion(); - } else { - // note: the cuSPARSE spmv wrapper defines a profiling region, so one is not - // needed here. - Impl::SPMV::spmv(space, controls, mode, alpha, A_i, x_i, - beta, y_i); - } -} - -/// \brief Kokkos sparse matrix-vector multiply on single -/// vector (RANK_ONE tag). 
Computes y := alpha*Op(A)*x + beta*y, where Op(A) is -/// controlled by mode (see below). -/// -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-1 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-1 Kokkos::View and its rank must match that of XVector -/// -/// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] A vector to multiply on the left by A. -/// \param beta [in] Scalar multiplier for the vector y. -/// \param y [in/out] Result vector. -/// \param tag RANK_ONE dispatch -#ifdef DOXY // documentation version -template -#else -template ::value>::type* = nullptr> -#endif -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_ONE& tag) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y, tag); -} - -#ifndef DOXY // hide SFINAE specialization for BSR -template ::value>::type* = nullptr> -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, - [[maybe_unused]] const RANK_ONE& tag) { - // Make sure that x and y are Views. 
- static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: XVector must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: YVector must be a Kokkos::View."); - // Make sure A, x, y are accessible to ExecutionSpace - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - // Make sure that x and y have the same rank. -#if (KOKKOS_VERSION >= 40100) - static_assert(XVector::rank() == YVector::rank(), - "KokkosSparse::spmv: Vector ranks do not match."); -#else - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); -#endif - // Make sure that x (and therefore y) is rank 1. - static_assert(static_cast(XVector::rank) == 1, - "KokkosSparse::spmv: Both Vector inputs must have rank 1 " - "in order to call this specialization of spmv."); - // Make sure that y is non-const. - static_assert(std::is_same::value, - "KokkosSparse::spmv: Output Vector must be non-const."); - - // - if (A.blockDim() == 1) { - KokkosSparse::CrsMatrix< - typename AMatrix::value_type, typename AMatrix::ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::size_type> - Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); - KokkosSparse::spmv(space, controls, mode, alpha, Acrs, x, beta, y, - RANK_ONE()); - return; - } - // Check compatibility of dimensions at run time. 
- if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols() * A.blockDim()) != - static_cast(x.extent(0))) || - (static_cast(A.numRows() * A.blockDim()) != - static_cast(y.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match: " - << ", A: " << A.numRows() * A.blockDim() << " x " - << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " - << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); - - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols() * A.blockDim()) != - static_cast(y.extent(0))) || - (static_cast(A.numRows() * A.blockDim()) != - static_cast(x.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match " - "(transpose): " - << ", A: " << A.numRows() * A.blockDim() << " x " - << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " - << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); - - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } - // - typedef KokkosSparse::Experimental::BsrMatrix< - typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - - typedef Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_Internal; - - AMatrix_Internal A_i(A); - XVector_Internal x_i(x); - YVector_Internal y_i(y); - - if (alpha == Kokkos::ArithTraits::zero() || A_i.numRows() == 0 || - A_i.numCols() == 
0 || A_i.nnz() == 0) { - // This is required to maintain semantics of KokkosKernels native SpMV: - // if y contains NaN but beta = 0, the result y should be filled with 0. - // For example, this is useful for passing in uninitialized y and beta=0. - if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(space, y_i, beta, y_i); - return; - } - - // - // Whether to call KokkosKernel's native implementation, even if a TPL impl is - // available - bool useFallback = controls.isParameter("algorithm") && - (controls.getParameter("algorithm") != "tpl"); - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE does not support the modes (C), (T), (H) - if (std::is_same::value || - std::is_same::value) { - useFallback = useFallback || (mode[0] != NoTranspose[0]); - } -#endif - -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same::value) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } -#endif - -#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE - // rocSparse does not support the modes (C), (T), (H) - if constexpr (std::is_same_v) { - useFallback = useFallback || (mode[0] != NoTranspose[0]); - } -#endif - - if (useFallback) { - // Explicitly call the non-TPL SPMV_BSRMATRIX implementation - std::string label = - "KokkosSparse::spmv[NATIVE,BSRMATRIX," + - Kokkos::ArithTraits< - typename AMatrix_Internal::non_const_value_type>::name() + - "]"; - Kokkos::Profiling::pushRegion(label); - Experimental::Impl::SPMV_BSRMATRIX::spmv_bsrmatrix(space, controls, - mode, alpha, A_i, - x_i, beta, y_i); - Kokkos::Profiling::popRegion(); - } else { - constexpr bool tpl_spec_avail = - KokkosSparse::Experimental::Impl::spmv_bsrmatrix_tpl_spec_avail< - ExecutionSpace, AMatrix_Internal, XVector_Internal, - YVector_Internal>::value; - - constexpr bool eti_spec_avail = - tpl_spec_avail - ? 
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY /* force FALSE in app/test */ - : KokkosSparse::Experimental::Impl::spmv_bsrmatrix_eti_spec_avail< - ExecutionSpace, AMatrix_Internal, XVector_Internal, - YVector_Internal>::value; - - Experimental::Impl::SPMV_BSRMATRIX< - ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, - tpl_spec_avail, eti_spec_avail>::spmv_bsrmatrix(space, controls, mode, - alpha, A_i, x_i, beta, - y_i); - } -} - -template ::value>::type* = nullptr> -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_ONE& tag) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y, tag); -} -#endif // ifndef DOXY - -namespace Impl { -template -struct SPMV2D1D { - static bool spmv2d1d(const char mode[], const AlphaType& alpha, - const AMatrix& A, const XVector& x, const BetaType& beta, - const YVector& y); - - template - static bool spmv2d1d(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, - const YVector& y); -}; - -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || !defined(KOKKOSKERNELS_ETI_ONLY) -template -struct SPMV2D1D { - static bool spmv2d1d(const char mode[], const AlphaType& alpha, - const AMatrix& A, const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); - return true; - } - - template - static bool spmv2d1d(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(space, mode, alpha, A, x, beta, y); - return true; - } -}; - -#else - -template -struct SPMV2D1D { - static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, - const BetaType& /*beta*/, const 
YVector& /*y*/) { - return false; - } - - template - static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], - const AlphaType& /*alpha*/, const AMatrix& /*A*/, - const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - return false; - } -}; -#endif - -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || !defined(KOKKOSKERNELS_ETI_ONLY) -template -struct SPMV2D1D { - static bool spmv2d1d(const char mode[], const AlphaType& alpha, - const AMatrix& A, const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); - return true; - } - - template - static bool spmv2d1d(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(space, mode, alpha, A, x, beta, y); - return true; - } -}; - -#else - -template -struct SPMV2D1D { - static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, - const BetaType& /*beta*/, const YVector& /*y*/) { - return false; - } - - template - static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], - const AlphaType& /*alpha*/, const AMatrix& /*A*/, - const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - return false; - } -}; -#endif - -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || !defined(KOKKOSKERNELS_ETI_ONLY) -template -struct SPMV2D1D { - static bool spmv2d1d(const char mode[], const AlphaType& alpha, - const AMatrix& A, const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); - return true; - } - - template - static bool spmv2d1d(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, - const YVector& y) { - spmv(space, mode, alpha, A, x, beta, y); - return true; - } -}; - 
-#else - -template -struct SPMV2D1D { - static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, - const BetaType& /*beta*/, const YVector& /*y*/) { - return false; - } - - template - static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], - const AlphaType& /*alpha*/, const AMatrix& /*A*/, - const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - return false; - } -}; -#endif -} // namespace Impl - -template -using SPMV2D1D - [[deprecated("KokkosSparse::SPMV2D1D is not part of the public interface - " - "use KokkosSparse::spmv instead")]] = - Impl::SPMV2D1D; - -/// \brief Kokkos sparse matrix-vector multiply on multivectors -/// (RANK_TWO tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is +// clang-format off +/// \brief Kokkos sparse matrix-vector multiply. +/// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is /// controlled by mode (see below). /// /// \tparam ExecutionSpace A Kokkos execution space. Must be able to access -/// the memory spaces of A, x, and y. +/// the memory spaces of A, x, and y. Must match Handle::ExecutionSpaceType. +/// \tparam Handle Specialization of KokkosSparse::SPMVHandle /// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-2 Kokkos::View and its rank must match that of XVector +/// YVector::value_type. +/// \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix. Must be identical to Handle::AMatrixType. +/// \tparam XVector Type of x, must be a rank-1 or 2 Kokkos::View. Must be identical to Handle::XVectorType. +/// \tparam BetaType Type of coefficient beta. 
Must be +/// convertible to YVector::value_type. +/// \tparam YVector Type of y, must be a rank-1 or 2 Kokkos::View and its rank must match that of XVector. Must +/// be identical to Handle::YVectorType. /// /// \param space [in] The execution space instance on which to run the /// kernel. -/// \param controls [in] kokkos-kernels control structure. +/// \param handle [in/out] a pointer to a KokkosSparse::SPMVHandle. On the first call to spmv with +/// a given handle instance, the handle's internal data will be initialized automatically. +/// On all later calls to spmv, this internal data will be reused. /// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] A vector to multiply on the left by A. -/// \param beta [in] Scalar multiplier for the vector y. -/// \param y [in/out] Result vector. -/// \param tag RANK_TWO dispatch -#ifdef DOXY // documentation version -template -#else -template ::value>::type* = nullptr> -#endif -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, - [[maybe_unused]] const RANK_TWO& tag) { - // Make sure that x and y are Views. 
- static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: XVector must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: YVector must be a Kokkos::View."); - // Make sure A, x, y are accessible to ExecutionSpace - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); -// Make sure that x and y have the same rank. -#if (KOKKOS_VERSION >= 40100) - static_assert(XVector::rank() == YVector::rank(), - "KokkosSparse::spmv: Vector ranks do not match."); -#else - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); -#endif - // Make sure that x (and therefore y) is rank 2. - static_assert(static_cast(XVector::rank) == 2, - "KokkosSparse::spmv: Both Vector inputs must have rank 2 " - "in order to call this specialization of spmv."); - // Make sure that y is non-const. - static_assert(std::is_same::value, - "KokkosSparse::spmv: Output Vector must be non-const."); - - // Check compatibility of dimensions at run time. 
- if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols()) > static_cast(x.extent(0))) || - (static_cast(A.numRows()) > static_cast(y.extent(0)))) { - std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions do not match: " - << ", A: " << A.numRows() << " x " << A.numCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols()) > static_cast(y.extent(0))) || - (static_cast(A.numRows()) > static_cast(x.extent(0)))) { - std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions do not match (transpose): " - << ", A: " << A.numRows() << " x " << A.numCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } - - typedef KokkosSparse::CrsMatrix< - typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - - AMatrix_Internal A_i = A; - - // Call single-vector version if appropriate - if (x.extent(1) == 1) { - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_SubInternal; - typedef Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_SubInternal; - - XVector_SubInternal x_i = Kokkos::subview(x, Kokkos::ALL(), 0); - YVector_SubInternal y_i = Kokkos::subview(y, Kokkos::ALL(), 0); - - // spmv (mode, alpha, A, x_i, beta, y_i); - using impl_type = - Impl::SPMV2D1D; - if (impl_type::spmv2d1d(space, 
mode, alpha, A, x_i, beta, y_i)) { - return; - } - } - { - typedef Kokkos::View< - typename XVector::const_value_type**, typename XVector::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - - typedef Kokkos::View > - YVector_Internal; - - XVector_Internal x_i = x; - YVector_Internal y_i = y; - - bool useNative = false; - -// cusparseSpMM does not support conjugate mode -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - useNative = useNative || (Conjugate[0] == mode[0]); -#endif - useNative = useNative || (controls.isParameter("algorithm") && - (controls.getParameter("algorithm") != "tpl")); - - if (useNative) { - return Impl::SPMV_MV< - ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, - std::is_integral::value, - false>::spmv_mv(space, controls, mode, alpha, A_i, x_i, beta, y_i); - } else { - return Impl::SPMV_MV::spmv_mv(space, controls, mode, - alpha, A_i, x_i, beta, - y_i); - } - } -} - -/// \brief Kokkos sparse matrix-vector multiply on multivectors -/// (RANK_TWO tag). Computes y := alpha*Op(A)*x + beta*y, where Op(A) is -/// controlled by mode (see below). -/// -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-2 Kokkos::View and its rank must match that of XVector -/// -/// \param controls [in] kokkos-kernels control structure. -/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] A vector to multiply on the left by A. -/// \param beta [in] Scalar multiplier for the vector y. 
-/// \param y [in/out] Result vector. -/// \param tag RANK_TWO dispatch -#ifdef DOXY -template -#else -template ::value>::type* = nullptr> -#endif -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_TWO& tag) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y, tag); -} - -#ifndef DOXY // hide SFINAE -template ::value>::type* = nullptr> -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], +/// transpose, "C" for conjugate or "H" for conjugate transpose. +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix A. If handle has previously been passed to spmv, A must be identical to the +/// A passed in to that first call. +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. +// clang-format on +template +void spmv(const ExecutionSpace& space, Handle* handle, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, - [[maybe_unused]] const RANK_TWO& tag) { + const BetaType& beta, const YVector& y) { + // Make sure A is a CrsMatrix or BsrMatrix. + static_assert( + is_crs_matrix_v || Experimental::is_bsr_matrix_v, + "KokkosSparse::spmv: AMatrix must be a CrsMatrix or BsrMatrix"); // Make sure that x and y are Views. 
static_assert(Kokkos::is_view::value, "KokkosSparse::spmv: XVector must be a Kokkos::View."); @@ -859,459 +90,449 @@ void spmv(const ExecutionSpace& space, static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); + "KokkosSparse::spmv: AMatrix must be accessible from ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); + "KokkosSparse::spmv: XVector must be accessible from ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); + "KokkosSparse::spmv: YVector must be accessible from ExecutionSpace"); // Make sure that x and y have the same rank. - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); - // Make sure that x (and therefore y) is rank 2. - static_assert(static_cast(XVector::rank) == 2, - "KokkosSparse::spmv: Both Vector inputs must have rank 2 " - "in order to call this specialization of spmv."); + static_assert(XVector::rank() == YVector::rank(), + "KokkosSparse::spmv: Vector ranks do not match."); + // Make sure that x (and therefore y) is rank 1 or 2. + static_assert(XVector::rank() == size_t(1) || XVector::rank() == size_t(2), + "KokkosSparse::spmv: Both Vector inputs must have rank 1 or 2"); // Make sure that y is non-const. 
- static_assert(std::is_same::value, + static_assert(!std::is_const_v, "KokkosSparse::spmv: Output Vector must be non-const."); - // - if (A.blockDim() == 1) { - KokkosSparse::CrsMatrix< - typename AMatrix::value_type, typename AMatrix::ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::size_type> - Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); - KokkosSparse::spmv(space, controls, mode, alpha, Acrs, x, beta, y, - RANK_TWO()); - return; + // Check that A, X, Y types match that of the Handle + // But only check this if Handle is the user-facing type (SPMVHandle). + // We may internally call spmv with SPMVHandleImpl, which does not include + // the matrix and vector types. + if constexpr (KokkosSparse::Impl::is_spmv_handle_v) { + static_assert( + std::is_same_v, + "KokkosSparse::spmv: AMatrix must be identical to Handle::AMatrixType"); + static_assert( + std::is_same_v, + "KokkosSparse::spmv: XVector must be identical to Handle::XVectorType"); + static_assert( + std::is_same_v, + "KokkosSparse::spmv: YVector must be identical to Handle::YVectorType"); } + + constexpr bool isBSR = Experimental::is_bsr_matrix_v; + // Check compatibility of dimensions at run time. 
+ size_t m, n; + + if constexpr (!isBSR) { + m = A.numRows(); + n = A.numCols(); + } else { + m = A.numRows() * A.blockDim(); + n = A.numCols() * A.blockDim(); + } + if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols() * A.blockDim()) != - static_cast(x.extent(0))) || - (static_cast(A.numRows() * A.blockDim()) != - static_cast(y.extent(0)))) { + if ((x.extent(1) != y.extent(1)) || (n != x.extent(0)) || + (m != y.extent(0))) { std::ostringstream os; - os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match: " - << ", A: " << A.numRows() * A.blockDim() << " x " - << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + os << "KokkosSparse::spmv: Dimensions do not match: " + << ", A: " << m << " x " << n << ", x: " << x.extent(0) << " x " << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); - KokkosKernels::Impl::throw_runtime_exception(os.str()); } } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numCols() * A.blockDim()) != - static_cast(y.extent(0))) || - (static_cast(A.numRows() * A.blockDim()) != - static_cast(x.extent(0)))) { + if ((x.extent(1) != y.extent(1)) || (m != x.extent(0)) || + (n != y.extent(0))) { std::ostringstream os; - os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match " - "(transpose): " - << ", A: " << A.numRows() * A.blockDim() << " x " - << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " - << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); - + os << "KokkosSparse::spmv: Dimensions do not match (transpose): " + << ", A: " << A.numRows() << " x " << A.numCols() + << ", x: " << x.extent(0) << " x " << x.extent(1) + << ", y: " << y.extent(0) << " x " << y.extent(1); KokkosKernels::Impl::throw_runtime_exception(os.str()); } } - // - typedef KokkosSparse::Experimental::BsrMatrix< - typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, - typename AMatrix::device_type, 
Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - AMatrix_Internal A_i(A); - - typedef Kokkos::View< - typename XVector::const_value_type**, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - XVector_Internal x_i(x); - typedef Kokkos::View< - typename YVector::non_const_value_type**, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_Internal; - YVector_Internal y_i(y); - // - if (alpha == Kokkos::ArithTraits::zero() || A_i.numRows() == 0 || - A_i.numCols() == 0 || A_i.nnz() == 0) { + // Efficiently handle cases where alpha*Op(A) is equivalent to the zero matrix + if (alpha == Kokkos::ArithTraits::zero() || m == 0 || n == 0 || + A.nnz() == 0) { // This is required to maintain semantics of KokkosKernels native SpMV: // if y contains NaN but beta = 0, the result y should be filled with 0. // For example, this is useful for passing in uninitialized y and beta=0. 
if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(space, y_i, Kokkos::ArithTraits::zero()); + Kokkos::deep_copy(space, y, Kokkos::ArithTraits::zero()); else - KokkosBlas::scal(space, y_i, beta, y_i); + KokkosBlas::scal(space, y, beta, y); return; } - // - // Call single-vector version if appropriate - // - if (x.extent(1) == 1) { - typedef Kokkos::View< - typename XVector::const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_SubInternal; - typedef Kokkos::View< - typename YVector::non_const_value_type*, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_SubInternal; - XVector_SubInternal x_0 = Kokkos::subview(x_i, Kokkos::ALL(), 0); - YVector_SubInternal y_0 = Kokkos::subview(y_i, Kokkos::ALL(), 0); + // Get the "impl" parent class of Handle, if it's not already the impl + using HandleImpl = typename Handle::ImplType; - return spmv(space, controls, mode, alpha, A_i, x_0, beta, y_0, RANK_ONE()); - } + using ACrs_Internal = CrsMatrix< + typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::const_size_type>; + using ABsr_Internal = Experimental::BsrMatrix< + typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::const_size_type>; + + using AMatrix_Internal = + std::conditional_t; + + // Intercept special case: A is a BsrMatrix with blockDim() == 1 + // This is exactly equivalent to CrsMatrix (more performant) + // and cuSPARSE actually errors out in that case. 
// - // Whether to call KokkosKernel's native implementation, even if a TPL impl is - // available - bool useFallback = controls.isParameter("algorithm") && - (controls.getParameter("algorithm") != "tpl"); + // This relies on the fact that this codepath will always be taken for + // this particular matrix (so internally, this handle is only ever used for + // Crs) + if constexpr (isBSR) { + if (A.blockDim() == 1) { + // Construct an ACrs_Internal (unmanaged memory) from A's views + typename ACrs_Internal::row_map_type rowmap(A.graph.row_map); + typename ACrs_Internal::index_type entries(A.graph.entries); + typename ACrs_Internal::values_type values(A.values); + ACrs_Internal ACrs(std::string{}, A.numRows(), A.numCols(), A.nnz(), + values, rowmap, entries); + spmv(space, handle->get_impl(), mode, alpha, ACrs, x, beta, y); + return; + } + } + + AMatrix_Internal A_i(A); + + // Note: data_type of a View includes both the scalar and rank + using XVector_Internal = Kokkos::View< + typename XVector::const_data_type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits>; + + using YVector_Internal = Kokkos::View< + typename YVector::non_const_data_type, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits>; + + XVector_Internal x_i(x); + YVector_Internal y_i(y); + bool useNative = is_spmv_algorithm_native(handle->get_algorithm()); + // Also use the native algorithm if SPMV_FAST_SETUP was selected and + // rocSPARSE is the possible TPL to use. Native is faster in this case. 
+#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + if (handle->get_algorithm() == SPMV_FAST_SETUP && + std::is_same_v) + useNative = true; +#endif + + // Now call the proper implementation depending on isBSR and the rank of X/Y + if constexpr (!isBSR) { + if constexpr (XVector::rank() == 1) { +///////////////// +// CRS, rank 1 // +///////////////// #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE does not support the modes (C), (T), (H) - if (std::is_same::value || - std::is_same::value) { - useFallback = useFallback || (mode[0] != NoTranspose[0]); - } + // cuSPARSE does not support the conjugate mode (C) + if constexpr (std::is_same_v || + std::is_same_v) { + useNative = useNative || (mode[0] == Conjugate[0]); + } + // cuSPARSE 12 requires that the output (y) vector is 16-byte aligned for + // all scalar types +#if defined(CUSPARSE_VER_MAJOR) && (CUSPARSE_VER_MAJOR == 12) + uintptr_t yptr = uintptr_t((void*)y.data()); + if (yptr % 16 != 0) useNative = true; +#endif +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + if (std::is_same::value) { + useNative = useNative || (mode[0] != NoTranspose[0]); + } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if (std::is_same::value) { - useFallback = useFallback || (mode[0] == Conjugate[0]); - } + if (std::is_same_v) { + useNative = useNative || (mode[0] == Conjugate[0]); + } +#ifdef KOKKOS_ENABLE_SYCL + if (std::is_same_v) { + useNative = useNative || (mode[0] == Conjugate[0]); + } +#endif +#endif + if (useNative) { + // Explicitly call the non-TPL SPMV implementation + std::string label = + "KokkosSparse::spmv[NATIVE," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + Impl::SPMV::spmv(space, + handle, + mode, alpha, + A_i, x_i, + beta, y_i); + Kokkos::Profiling::popRegion(); + } else { + // note: the cuSPARSE spmv wrapper defines a profiling region, so one is + // not needed here. 
+ Impl::SPMV::spmv(space, handle, + mode, alpha, A_i, + x_i, beta, y_i); + } + } else { +///////////////// +// CRS, rank 2 // +///////////////// +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + useNative = useNative || (Conjugate[0] == mode[0]); #endif - if (useFallback) { - // Explicitly call the non-TPL SPMV_BSRMATRIX implementation - std::string label = - "KokkosSparse::spmv[NATIVE,BSMATRIX," + - Kokkos::ArithTraits< - typename AMatrix_Internal::non_const_value_type>::name() + - "]"; - Kokkos::Profiling::pushRegion(label); - Experimental::Impl::SPMV_MV_BSRMATRIX< - ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, - std::is_integral::value, - false>::spmv_mv_bsrmatrix(space, controls, mode, alpha, A_i, x_i, beta, - y_i); - Kokkos::Profiling::popRegion(); + if (useNative) { + std::string label = + "KokkosSparse::spmv[NATIVE,MV," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + return Impl::SPMV_MV< + ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, + YVector_Internal, + std::is_integral::value, + false>::spmv_mv(space, handle, mode, alpha, A_i, x_i, beta, y_i); + Kokkos::Profiling::popRegion(); + } else { + return Impl::SPMV_MV::spmv_mv(space, handle, mode, + alpha, A_i, x_i, beta, + y_i); + } + } } else { - Experimental::Impl::SPMV_MV_BSRMATRIX< - ExecutionSpace, AMatrix_Internal, XVector_Internal, YVector_Internal, - std::is_integral::value>:: - spmv_mv_bsrmatrix(space, controls, mode, alpha, A_i, x_i, beta, y_i); - } -} - -template ::value>::type* = nullptr> -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y, const RANK_TWO& tag) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y, tag); -} + if constexpr (XVector::rank() == 1) { +///////////////// +// BSR, rank 1 // +///////////////// 
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE does not support the modes (C), (T), (H) + if (std::is_same::value || + std::is_same::value) { + useNative = useNative || (mode[0] != NoTranspose[0]); + } #endif -/// \brief Public interface to local sparse matrix-vector multiply. -/// -/// Compute y := beta*y + alpha*Op(A)*x, where x and y are either both -/// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View -/// instances, and Op(A) is determined -/// by \c mode. If beta == 0, ignore and overwrite the initial -/// entries of y; if alpha == 0, ignore the entries of A and x. -/// -/// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have -/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on -/// Volta or Ampere architectures. On Volta-architecture GPUs the only available -/// precision is mixed-precision fp32 accumulator from fp16 inputs. On -/// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16, -/// x is fp16, and y is fp32. Otherwise, double-precision is used. The caller -/// may override this by setting the \c "tc_precision" = \c "mixed" or -/// \c "double" as desired. -/// -/// For mixed precision, performance will degrade for blockDim < 16. -/// For double precision, for blockDim < 8. -/// For such cases, consider an alternate SpMV algorithm. -/// -/// May have \c "algorithm" set to \c "native" to bypass TPLs if they are -/// enabled for Kokkos::CrsMatrix and Kokkos::Experimental::BsrMatrix on a -/// single vector, or for Kokkos::Experimental::BsrMatrix with a multivector. -/// -/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access -/// the memory spaces of A, x, and y. -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. 
\tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank 1 or 2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank 1 or 2 Kokkos::View and its rank must match that of XVector -/// -/// \param space [in] The execution space instance on which to run the -/// kernel. -/// \param controls [in] kokkos-kernels control structure -/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. -/// \param x [in] Either a single vector (rank-1 Kokkos::View) or -/// multivector (rank-2 Kokkos::View). -/// \param beta [in] Scalar multiplier for the (multi)vector y. -/// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or -/// multivector (rank-2 Kokkos::View). It must have the same number -/// of columns as x. -template -void spmv(const ExecutionSpace& space, - KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y) { - // Make sure that x and y are Views. 
- static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: XVector must be a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosSparse::spmv: YVector must be a Kokkos::View."); - // Make sure A, x, y are accessible to ExecutionSpace - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: AMatrix must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: XVector must be accessible from ExecutionSpace"); - static_assert( - Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv: YVector must be accessible from ExecutionSpace"); - // Make sure that both x and y have the same rank. - static_assert( - static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosSparse::spmv: Vector ranks do not match."); - // Make sure that y is non-const. - static_assert(std::is_same::value, - "KokkosSparse::spmv: Output Vector must be non-const."); - - // Check compatibility of dimensions at run time. 
- if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numPointCols()) != - static_cast(x.extent(0))) || - (static_cast(A.numPointRows()) != - static_cast(y.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv (Generic): Dimensions do not match: " - << ", A: " << A.numPointRows() << " x " << A.numPointCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (std::is_same::value) { + useNative = useNative || (mode[0] == Conjugate[0]); + } +#endif - KokkosKernels::Impl::throw_runtime_exception(os.str()); - } - } else { - if ((x.extent(1) != y.extent(1)) || - (static_cast(A.numPointCols()) != - static_cast(y.extent(0))) || - (static_cast(A.numPointRows()) != - static_cast(x.extent(0)))) { - std::ostringstream os; - os << "KokkosSparse::spmv (Generic): Dimensions do not match " - "(transpose): " - << ", A: " << A.numPointRows() << " x " << A.numPointCols() - << ", x: " << x.extent(0) << " x " << x.extent(1) - << ", y: " << y.extent(0) << " x " << y.extent(1); +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + // rocSparse does not support the modes (C), (T), (H) + if constexpr (std::is_same_v) { + useNative = useNative || (mode[0] != NoTranspose[0]); + } +#endif + if (useNative) { + // Explicitly call the non-TPL SPMV_BSRMATRIX implementation + std::string label = + "KokkosSparse::spmv[NATIVE,BSRMATRIX," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + Impl::SPMV_BSRMATRIX::spmv_bsrmatrix(space, handle, mode, alpha, + A_i, x_i, beta, y_i); + Kokkos::Profiling::popRegion(); + } else { + Impl::SPMV_BSRMATRIX::spmv_bsrmatrix(space, handle, + mode, alpha, A_i, + x_i, beta, y_i); + } + } else { + ///////////////// + // BSR, rank 2 // + ///////////////// +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE does not 
support the modes (C), (T), (H) + if (std::is_same::value || + std::is_same::value) { + useNative = useNative || (mode[0] != NoTranspose[0]); + } +#endif - KokkosKernels::Impl::throw_runtime_exception(os.str()); +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (std::is_same::value) { + useNative = useNative || (mode[0] == Conjugate[0]); + } +#endif + if (useNative) { + // Explicitly call the non-TPL SPMV_BSRMATRIX implementation + std::string label = + "KokkosSparse::spmv[NATIVE,MV,BSMATRIX," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + Impl::SPMV_MV_BSRMATRIX< + ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, + YVector_Internal, + std::is_integral< + typename AMatrix_Internal::const_value_type>::value, + false>::spmv_mv_bsrmatrix(space, handle, mode, alpha, A_i, x_i, + beta, y_i); + Kokkos::Profiling::popRegion(); + } else { + Impl::SPMV_MV_BSRMATRIX< + ExecutionSpace, HandleImpl, AMatrix_Internal, XVector_Internal, + YVector_Internal, + std::is_integral:: + value>::spmv_mv_bsrmatrix(space, handle, mode, alpha, A_i, x_i, + beta, y_i); + } } } - - if (alpha == Kokkos::ArithTraits::zero() || A.numRows() == 0 || - A.numCols() == 0 || A.nnz() == 0) { - // This is required to maintain semantics of KokkosKernels native SpMV: - // if y contains NaN but beta = 0, the result y should be filled with 0. - // For example, this is useful for passing in uninitialized y and beta=0. - if (beta == Kokkos::ArithTraits::zero()) - Kokkos::deep_copy(space, y, Kokkos::ArithTraits::zero()); - else - KokkosBlas::scal(space, y, beta, y); - return; - } - // - using RANK_SPECIALISE = - typename std::conditional(XVector::rank) == 2, RANK_TWO, - RANK_ONE>::type; - spmv(space, controls, mode, alpha, A, x, beta, y, RANK_SPECIALISE()); } -/// \brief Public interface to local sparse matrix-vector multiply. 
-/// -/// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both -/// rank 1 (single vectors) or rank 2 (multivectors) Kokkos::View -/// instances, and Op(A) is determined -/// by \c mode. If beta == 0, ignore and overwrite the initial -/// entries of y; if alpha == 0, ignore the entries of A and x. -/// -/// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have -/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on -/// Volta or Ampere architectures. On Volta-architecture GPUs the only available -/// precision is mixed-precision fp32 accumulator from fp16 inputs. On -/// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16, -/// x is fp16, and y is fp32. Otherwise, double-precision is used. The caller -/// may override this by setting the \c "tc_precision" = \c "mixed" or -/// \c "double" as desired. -/// -/// For mixed precision, performance will degrade for blockDim < 16. -/// For double precision, for blockDim < 8. -/// For such cases, consider an alternate SpMV algorithm. -/// -/// May have \c "algorithm" set to \c "native" to bypass TPLs if they are -/// enabled for Kokkos::CrsMatrix and Kokkos::Experimental::BsrMatrix on a -/// single vector, or for Kokkos::Experimental::BsrMatrix with a multivector. +// clang-format off +/// \brief Kokkos sparse matrix-vector multiply. +/// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode +/// (see below). /// -/// \tparam AMatrix KokkosSparse::CrsMatrix or -/// KokkosSparse::Experimental::BsrMatrix +/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access +/// the memory spaces of A, x, and y. +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to +/// YVector::value_type. +/// \tparam AMatrix A KokkosSparse::CrsMatrix, or KokkosSparse::Experimental::BsrMatrix +/// \tparam XVector Type of x, must be a rank-1 or rank-2 Kokkos::View +/// \tparam BetaType Type of coefficient beta. 
Must be convertible to YVector::value_type. +/// \tparam YVector Type of y, must be a Kokkos::View and its rank must match that of XVector /// -/// \param controls [in] kokkos-kernels control structure -/// \param mode [in] "N" for no transpose, "T" for transpose, or "C" -/// for conjugate transpose. +/// \param space [in] The execution space instance on which to run the kernel. +/// \param mode [in] Select A's operator mode: "N" for normal, "T" for +/// transpose, "C" for conjugate or "H" for conjugate transpose. /// \param alpha [in] Scalar multiplier for the matrix A. /// \param A [in] The sparse matrix A. -/// \param x [in] Either a single vector (rank-1 Kokkos::View) or -/// multivector (rank-2 Kokkos::View). -/// \param beta [in] Scalar multiplier for the (multi)vector y. -/// \param y [in/out] Either a single vector (rank-1 Kokkos::View) or -/// multivector (rank-2 Kokkos::View). It must have the same number -/// of columns as x. -template -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y) { - spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, - y); -} - -#ifndef DOXY -/// \brief Catch-all public interface to error on invalid Kokkos::Sparse spmv -/// argument types -/// -/// This is a catch-all interface that throws a compile-time error if \c -/// AMatrix is not a CrsMatrix, or BsrMatrix -/// -template ::value && - !KokkosSparse::is_crs_matrix::value>::type* = nullptr> -void spmv(KokkosKernels::Experimental::Controls /*controls*/, - const char[] /*mode*/, const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - // have to arrange this so that the compiler can't tell this is false until - // instantiation - static_assert(KokkosSparse::is_crs_matrix::value || - KokkosSparse::Experimental::is_bsr_matrix::value, - "SpMV: AMatrix must be 
CrsMatrix or BsrMatrix"); -} - -/// \brief Catch-all public interface to error on invalid Kokkos::Sparse spmv -/// argument types -/// -/// This is a catch-all interface that throws a compile-time error if \c -/// AMatrix is not a CrsMatrix, or BsrMatrix -/// +/// \param x [in] A vector to multiply on the left by A. +/// \param beta [in] Scalar multiplier for the vector y. +/// \param y [in/out] Result vector. +// clang-format on template ::value && - !KokkosSparse::is_crs_matrix::value>::type* = nullptr> -void spmv(const ExecutionSpace& /* space */, - KokkosKernels::Experimental::Controls /*controls*/, - const char[] /*mode*/, const AlphaType& /*alpha*/, - const AMatrix& /*A*/, const XVector& /*x*/, const BetaType& /*beta*/, - const YVector& /*y*/) { - // have to arrange this so that the compiler can't tell this is false until - // instantiation - static_assert(KokkosSparse::is_crs_matrix::value || - KokkosSparse::Experimental::is_bsr_matrix::value, - "SpMV: AMatrix must be CrsMatrix or BsrMatrix"); + typename = std::enable_if_t< + Kokkos::is_execution_space::value>> +void spmv(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + SPMVAlgorithm algo = SPMV_FAST_SETUP; + // Without handle reuse, native is overall faster than rocSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + if constexpr (std::is_same_v) + algo = SPMV_NATIVE; +#endif + SPMVHandle + handle(algo); + spmv(space, &handle, mode, alpha, A, x, beta, y); } -#endif // ifndef DOXY +// clang-format off /// \brief Kokkos sparse matrix-vector multiply. /// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode /// (see below). /// +/// \tparam Handle Specialization of KokkosSparse::SPMVHandle /// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. 
\tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-2 Kokkos::View and its rank must match that of XVector +/// YVector::value_type. +/// \tparam AMatrix A KokkosSparse::CrsMatrix, or +/// KokkosSparse::Experimental::BsrMatrix. Must be identical to Handle::AMatrixType. +/// \tparam XVector Type of x. Must be a rank-1 or 2 Kokkos::View and be identical to Handle::XVectorType. +/// \tparam BetaType Type of coefficient beta. Must be convertible to YVector::value_type. +/// \tparam YVector Type of y. Must have the same rank as XVector and be identical to Handle::YVectorType. /// +/// \param handle [in/out] a pointer to a KokkosSparse::SPMVHandle. On the first call to spmv with +/// a given handle instance, the handle's internal data will be initialized automatically. +/// On all later calls to spmv, this internal data will be reused. /// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// transpose, "C" for conjugate or "H" for conjugate transpose. +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix A. /// \param x [in] A vector to multiply on the left by A. /// \param beta [in] Scalar multiplier for the vector y. /// \param y [in/out] Result vector. 
-template -void spmv(const char mode[], const AlphaType& alpha, const AMatrix& A, - const XVector& x, const BetaType& beta, const YVector& y) { - KokkosKernels::Experimental::Controls controls; - spmv(controls, mode, alpha, A, x, beta, y); +// clang-format on +template < + class Handle, class AlphaType, class AMatrix, class XVector, class BetaType, + class YVector, + typename = std::enable_if_t::value>> +void spmv(Handle* handle, const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(typename Handle::ExecutionSpaceType(), handle, mode, alpha, A, x, beta, + y); } +// clang-format off /// \brief Kokkos sparse matrix-vector multiply. /// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is controlled by mode /// (see below). /// -/// \tparam ExecutionSpace A Kokkos execution space. Must be able to access -/// the memory spaces of A, x, and y. -/// \tparam AlphaType Type of coefficient alpha. Must be convertible to -/// YVector::value_type. \tparam AMatrix A KokkosSparse::CrsMatrix, or -/// KokkosSparse::Experimental::BsrMatrix \tparam XVector Type of x, must be a -/// rank-2 Kokkos::View \tparam BetaType Type of coefficient beta. Must be -/// convertible to YVector::value_type. \tparam YVector Type of y, must be a -/// rank-2 Kokkos::View and its rank must match that of XVector +/// \tparam AlphaType Type of coefficient alpha. Must be convertible to YVector::value_type. +/// \tparam AMatrix A KokkosSparse::CrsMatrix, or KokkosSparse::Experimental::BsrMatrix +/// \tparam XVector Type of x, must be a rank-1 or rank-2 Kokkos::View +/// \tparam BetaType Type of coefficient beta. Must be convertible to YVector::value_type. +/// \tparam YVector Type of y, must be a Kokkos::View and its rank must match that of XVector /// -/// \param space [in] The execution space instance on which to run the -/// kernel. 
/// \param mode [in] Select A's operator mode: "N" for normal, "T" for -/// transpose, "C" for conjugate or "H" for conjugate transpose. \param alpha -/// [in] Scalar multiplier for the matrix A. \param A [in] The sparse matrix A. +/// transpose, "C" for conjugate or "H" for conjugate transpose. +/// \param alpha [in] Scalar multiplier for the matrix A. +/// \param A [in] The sparse matrix A. /// \param x [in] A vector to multiply on the left by A. /// \param beta [in] Scalar multiplier for the vector y. /// \param y [in/out] Result vector. -template -void spmv(const ExecutionSpace& space, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y) { - KokkosKernels::Experimental::Controls controls; - spmv(space, controls, mode, alpha, A, x, beta, y); +// clang-format on +template +void spmv(const char mode[], const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, const YVector& y) { + SPMVAlgorithm algo = SPMV_FAST_SETUP; + // Without handle reuse, native is overall faster than rocSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + if constexpr (std::is_same_v) + algo = SPMV_NATIVE; +#endif + SPMVHandle + handle(algo); + spmv(typename AMatrix::execution_space(), &handle, mode, alpha, A, x, beta, + y); } namespace Experimental { @@ -1332,17 +553,17 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: AMatrix must be accessible from " + "KokkosSparse::spmv_struct: AMatrix must be accessible from " "ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: XVector must be accessible from " + "KokkosSparse::spmv_struct: XVector must be accessible from " "ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: YVector must be accessible from " + "KokkosSparse::spmv_struct: YVector must be 
accessible from " "ExecutionSpace"); // Make sure that x (and therefore y) is rank 1. static_assert( @@ -1391,13 +612,13 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], typename XVector::const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, typename XVector::device_type, - Kokkos::MemoryTraits > + Kokkos::MemoryTraits> XVector_Internal; typedef Kokkos::View< typename YVector::non_const_value_type*, typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > + typename YVector::device_type, Kokkos::MemoryTraits> YVector_Internal; AMatrix_Internal A_i = A; @@ -1627,25 +848,25 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: AMatrix must be accessible from " + "KokkosSparse::spmv_struct: AMatrix must be accessible from " "ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: XVector must be accessible from " + "KokkosSparse::spmv_struct: XVector must be accessible from " "ExecutionSpace"); static_assert( Kokkos::SpaceAccessibility::accessible, - "KokkosBlas::spmv_struct: YVector must be accessible from " + "KokkosSparse::spmv_struct: YVector must be accessible from " "ExecutionSpace"); // Make sure that both x and y have the same rank. static_assert(XVector::rank == YVector::rank, - "KokkosBlas::spmv: Vector ranks do not match."); + "KokkosSparse::spmv: Vector ranks do not match."); // Make sure that y is non-const. static_assert(std::is_same::value, - "KokkosBlas::spmv: Output Vector must be non-const."); + "KokkosSparse::spmv: Output Vector must be non-const."); // Check compatibility of dimensions at run time. 
if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { @@ -1653,7 +874,7 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], (static_cast(A.numCols()) > static_cast(x.extent(0))) || (static_cast(A.numRows()) > static_cast(y.extent(0)))) { std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions do not match: " + os << "KokkosSparse::spmv: Dimensions do not match: " << ", A: " << A.numRows() << " x " << A.numCols() << ", x: " << x.extent(0) << " x " << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); @@ -1664,7 +885,7 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], (static_cast(A.numCols()) > static_cast(y.extent(0))) || (static_cast(A.numRows()) > static_cast(x.extent(0)))) { std::ostringstream os; - os << "KokkosBlas::spmv: Dimensions do not match (transpose): " + os << "KokkosSparse::spmv: Dimensions do not match (transpose): " << ", A: " << A.numRows() << " x " << A.numCols() << ", x: " << x.extent(0) << " x " << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); @@ -1685,11 +906,11 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], typedef Kokkos::View< typename XVector::const_value_type*, typename YVector::array_layout, typename XVector::device_type, - Kokkos::MemoryTraits > + Kokkos::MemoryTraits> XVector_SubInternal; typedef Kokkos::View< typename YVector::non_const_value_type*, typename YVector::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > + typename YVector::device_type, Kokkos::MemoryTraits> YVector_SubInternal; XVector_SubInternal x_i = Kokkos::subview(x, Kokkos::ALL(), 0); @@ -1706,28 +927,7 @@ void spmv_struct(const ExecutionSpace& space, const char mode[], } // Call true rank 2 vector implementation - { - typedef Kokkos::View< - typename XVector::const_value_type**, typename XVector::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - - typedef Kokkos::View > - YVector_Internal; - - 
XVector_Internal x_i = x; - YVector_Internal y_i = y; - - return KokkosSparse::Impl::SPMV_MV< - ExecutionSpace, AMatrix_Internal, XVector_Internal, - YVector_Internal>::spmv_mv(space, - KokkosKernels::Experimental::Controls(), - mode, alpha, A_i, x_i, beta, y_i); - } + spmv(space, mode, alpha, A, x, beta, y); } template +struct SPMV2D1D { + static bool spmv2d1d(const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, const BetaType& beta, + const YVector& y); + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y); +}; + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || !defined(KOKKOSKERNELS_ETI_ONLY) +template +struct SPMV2D1D { + static bool spmv2d1d(const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); + return true; + } + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); + return true; + } +}; + +#else + +template +struct SPMV2D1D { + static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, + const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } + + template + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } +}; +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || !defined(KOKKOSKERNELS_ETI_ONLY) +template +struct SPMV2D1D { + static bool spmv2d1d(const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, 
const BetaType& beta, + const YVector& y) { + spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); + return true; + } + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); + return true; + } +}; + +#else + +template +struct SPMV2D1D { + static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, + const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } + + template + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } +}; +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || !defined(KOKKOSKERNELS_ETI_ONLY) +template +struct SPMV2D1D { + static bool spmv2d1d(const char mode[], const AlphaType& alpha, + const AMatrix& A, const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(typename AMatrix::execution_space{}, mode, alpha, A, x, beta, y); + return true; + } + + template + static bool spmv2d1d(const ExecutionSpace& space, const char mode[], + const AlphaType& alpha, const AMatrix& A, + const XVector& x, const BetaType& beta, + const YVector& y) { + spmv(space, mode, alpha, A, x, beta, y); + return true; + } +}; + +#else + +template +struct SPMV2D1D { + static bool spmv2d1d(const char /*mode*/[], const AlphaType& /*alpha*/, + const AMatrix& /*A*/, const XVector& /*x*/, + const BetaType& /*beta*/, const YVector& /*y*/) { + return false; + } + + template + static bool spmv2d1d(const ExecutionSpace& /* space */, const char /*mode*/[], + const AlphaType& /*alpha*/, const AMatrix& /*A*/, + const XVector& /*x*/, const BetaType& /*beta*/, + const YVector& /*y*/) { + return false; + } +}; +#endif +} // namespace Impl + 
+template +using SPMV2D1D + [[deprecated("KokkosSparse::SPMV2D1D is not part of the public interface - " + "use KokkosSparse::spmv instead")]] = + Impl::SPMV2D1D; + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + // Default to fast setup, since this handle can't be reused + SPMVAlgorithm algo = SPMV_FAST_SETUP; + // Translate the Controls algorithm selection to the SPMVHandle algorithm. + // This maintains the old behavior, where any manually set name that isn't + // "tpl" gives native. + // + // This also uses the behavior set by #2021: "merge" was a hint to use + // cuSPARSE merge path, but that path is gone so just use the normal TPL. + // "merge-path" means to use the KK merge-path implementation. + // + // And also support the 3 different BSR algorithms by their old names. 
+ if (controls.isParameter("algorithm")) { + std::string algoName = controls.getParameter("algorithm"); + if (algoName == "merge" || algoName == "tpl") + algo = SPMV_FAST_SETUP; + else if (algoName == "native-merge") + algo = SPMV_MERGE_PATH; + else if (algoName == "v4.1") + algo = SPMV_BSR_V41; + else if (algoName == "v4.2") + algo = SPMV_BSR_V41; + else if (algoName == "experimental_bsr_tc" || algoName == "experimental_tc") + algo = SPMV_BSR_TC; + else + throw std::invalid_argument( + std::string("KokkosSparse::spmv: controls algorithm name '") + + algoName + "' is not supported.\n"); + } + KokkosSparse::SPMVHandle handle( + algo); + // Pull out any expert tuning parameters + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + handle.force_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + handle.force_static_schedule = true; + } + } + if (controls.isParameter("team size")) + handle.team_size = std::stoi(controls.getParameter("team size")); + if (controls.isParameter("vector length")) + handle.vector_length = std::stoi(controls.getParameter("vector length")); + if (controls.isParameter("rows per thread")) + handle.rows_per_thread = + std::stoll(controls.getParameter("rows per thread")); + spmv(space, &handle, mode, alpha, A, x, beta, y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y) { + spmv(typename AMatrix::execution_space{}, controls, mode, alpha, A, x, beta, + y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const 
XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE&) { + spmv(space, controls, mode, alpha, A, x, beta, y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE&) { + spmv(controls, mode, alpha, A, x, beta, y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(const ExecutionSpace& space, + KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_TWO&) { + spmv(space, controls, mode, alpha, A, x, beta, y); +} + +template +[ + [deprecated("Use the version of spmv that takes a SPMVHandle instead of " + "Controls")]] void +spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_TWO&) { + spmv(controls, mode, alpha, A, x, beta, y); +} + +} // namespace KokkosSparse + +#endif diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp new file mode 100644 index 000000000000..9e7295c72c31 --- /dev/null +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_spmv_handle.hpp @@ -0,0 +1,389 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_SPMV_HANDLE_HPP_ +#define KOKKOSSPARSE_SPMV_HANDLE_HPP_ + +#include +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_BsrMatrix.hpp" +// Use TPL utilities for safely finalizing matrix descriptors, etc. +#include "KokkosSparse_Utils_cusparse.hpp" +#include "KokkosSparse_Utils_rocsparse.hpp" +#include "KokkosSparse_Utils_mkl.hpp" + +namespace KokkosSparse { + +/// SPMVAlgorithm values can be used to select different algorithms/methods for +/// performing SpMV computations. +enum SPMVAlgorithm { + SPMV_DEFAULT, /// Default algorithm: best overall performance for repeated + /// applications of SpMV. + SPMV_FAST_SETUP, /// Best performance in the non-reuse case, where the handle + /// is only used once. + SPMV_NATIVE, /// Use the best KokkosKernels implementation, even if a TPL + /// implementation is available. + SPMV_MERGE_PATH, /// Use load-balancing merge path algorithm (for CrsMatrix + /// only) + SPMV_BSR_V41, /// Use experimental version 4.1 algorithm (for BsrMatrix only) + SPMV_BSR_V42, /// Use experimental version 4.2 algorithm (for BsrMatrix only) + SPMV_BSR_TC /// Use experimental tensor core algorithm (for BsrMatrix only) +}; + +namespace Experimental { +/// Precision to use in the tensor core implementation of Bsr SpMV +enum class Bsr_TC_Precision { + Automatic, ///< Use Double, unless operations match mixed precision + Double, ///< fp64 += fp64 * fp64 + Mixed ///< fp32 += fp16 * fp16 +}; +} // namespace Experimental + +/// Get the name of a SPMVAlgorithm enum constant +inline const char* get_spmv_algorithm_name(SPMVAlgorithm a) { + switch (a) { + case SPMV_DEFAULT: return "SPMV_DEFAULT"; + case SPMV_FAST_SETUP: return "SPMV_FAST_SETUP"; + case SPMV_NATIVE: return "SPMV_NATIVE"; + case SPMV_MERGE_PATH: return "SPMV_MERGE_PATH"; + case SPMV_BSR_V41: return "SPMV_BSR_V41"; + case 
SPMV_BSR_V42: return "SPMV_BSR_V42"; + case SPMV_BSR_TC: return "SPMV_BSR_TC"; + } + throw std::invalid_argument( + "SPMVHandle::get_algorithm_name: unknown algorithm"); + return ""; +} + +/// Return true if the given algorithm is always a native (KokkosKernels) +/// implementation, and false if it may be implemented by a TPL. +inline bool is_spmv_algorithm_native(SPMVAlgorithm a) { + switch (a) { + case SPMV_NATIVE: + case SPMV_MERGE_PATH: + case SPMV_BSR_V41: + case SPMV_BSR_V42: + case SPMV_BSR_TC: return true; + default: return false; + } +} + +namespace Impl { + +template +struct TPL_SpMV_Data { + // Disallow default construction: must provide the initial execution space + TPL_SpMV_Data() = delete; + TPL_SpMV_Data(const ExecutionSpace& exec_) : exec(exec_) {} + void set_exec_space(const ExecutionSpace& new_exec) { + // Check if new_exec is different from (old) exec. + // If it is, fence the old exec now. + // That way, SPMVHandle cleanup doesn't need + // to worry about resources still being in use on the old exec. + if (exec != new_exec) { + exec.fence(); + exec = new_exec; + } + } + virtual ~TPL_SpMV_Data() {} + ExecutionSpace exec; +}; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) +// Data used by cuSPARSE >=10.3 for both single-vector (SpMV) and multi-vector +// (SpMM). 
+// TODO: in future, this can also be used for BSR (cuSPARSE >=12.2) +struct CuSparse10_SpMV_Data : public TPL_SpMV_Data { + CuSparse10_SpMV_Data(const Kokkos::Cuda& exec_) : TPL_SpMV_Data(exec_) {} + ~CuSparse10_SpMV_Data() { + // Prefer cudaFreeAsync on the stream that last executed a spmv, but + // async memory management was introduced in 11.2 +#if (CUDA_VERSION >= 11020) + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(buffer, exec.cuda_stream())); +#else + // Fence here to ensure spmv is not still using buffer + // (cudaFree does not do a device synchronize) + exec.fence(); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(buffer)); +#endif + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(mat)); + } + + cusparseSpMatDescr_t mat; + size_t bufferSize = 0; + void* buffer = nullptr; +}; +#endif + +// Data used by cuSPARSE <10.3 for CRS, and >=9 for BSR +struct CuSparse9_SpMV_Data : public TPL_SpMV_Data { + CuSparse9_SpMV_Data(const Kokkos::Cuda& exec_) : TPL_SpMV_Data(exec_) {} + ~CuSparse9_SpMV_Data() { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(mat)); + } + + cusparseMatDescr_t mat; +}; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +struct RocSparse_CRS_SpMV_Data : public TPL_SpMV_Data { + RocSparse_CRS_SpMV_Data(const Kokkos::HIP& exec_) : TPL_SpMV_Data(exec_) {} + ~RocSparse_CRS_SpMV_Data() { + // note: hipFree includes an implicit device synchronize + KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(buffer)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_spmat_descr(mat)); + } + + rocsparse_spmat_descr mat; + size_t bufferSize = 0; + void* buffer = nullptr; +}; + +struct RocSparse_BSR_SpMV_Data : public TPL_SpMV_Data { + RocSparse_BSR_SpMV_Data(const Kokkos::HIP& exec_) : TPL_SpMV_Data(exec_) {} + ~RocSparse_BSR_SpMV_Data() { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_descr(mat)); +#if (KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400) + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_info(info)); +#endif + } + + rocsparse_mat_descr mat; +#if 
(KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400) + rocsparse_mat_info info; +#endif +}; +#endif + +// note: header defining __INTEL_MKL__ is pulled in above by Utils_mkl.hpp +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + +#if (__INTEL_MKL__ > 2017) +// Data for classic MKL (both CRS and BSR) +template +struct MKL_SpMV_Data : public TPL_SpMV_Data { + MKL_SpMV_Data(const ExecutionSpace& exec_) + : TPL_SpMV_Data(exec_) {} + ~MKL_SpMV_Data() { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mat)); + // descr is just a plain-old-data struct, no cleanup to do + } + + sparse_matrix_t mat; + matrix_descr descr; +}; +#endif + +#if defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) +struct OneMKL_SpMV_Data : public TPL_SpMV_Data { + OneMKL_SpMV_Data(const Kokkos::Experimental::SYCL& exec_) + : TPL_SpMV_Data(exec_) {} + ~OneMKL_SpMV_Data() { + // Make sure no spmv is still running with this handle, if exec uses an + // out-of-order queue (rare case) + if (!exec.sycl_queue().is_in_order()) exec.fence(); +#if INTEL_MKL_VERSION >= 20230200 + // MKL 2023.2 and up make this async release okay even though it takes a + // pointer to mat, which is going out of scope after this destructor + oneapi::mkl::sparse::release_matrix_handle(exec.sycl_queue(), &mat); +#else + // But in older versions, wait on ev_release before letting mat go out of + // scope + auto ev_release = + oneapi::mkl::sparse::release_matrix_handle(exec.sycl_queue(), &mat); + ev_release.wait(); +#endif + } + + oneapi::mkl::sparse::matrix_handle_t mat; +}; +#endif +#endif + +template +struct SPMVHandleImpl { + using ExecutionSpaceType = ExecutionSpace; + // This is its own ImplType + using ImplType = + SPMVHandleImpl; + // Do not allow const qualifier on Scalar, Ordinal, Offset (otherwise this + // type won't match the ETI'd type). Users should not use SPMVHandleImpl + // directly and SPMVHandle explicitly removes const, so this should never + // happen in practice. 
+ static_assert(!std::is_const_v, + "SPMVHandleImpl: Scalar must not be a const type"); + static_assert(!std::is_const_v, + "SPMVHandleImpl: Offset must not be a const type"); + static_assert(!std::is_const_v, + "SPMVHandleImpl: Ordinal must not be a const type"); + SPMVHandleImpl(SPMVAlgorithm algo_) : algo(algo_) {} + ~SPMVHandleImpl() { + if (tpl) delete tpl; + } + void set_exec_space(const ExecutionSpace& exec) { + if (tpl) tpl->set_exec_space(exec); + } + + /// Get the SPMVAlgorithm used by this handle + SPMVAlgorithm get_algorithm() const { return this->algo; } + + bool is_set_up = false; + const SPMVAlgorithm algo = SPMV_DEFAULT; + TPL_SpMV_Data* tpl = nullptr; + // Expert tuning parameters for native SpMV + // TODO: expose a proper Experimental interface to set these. Currently they + // can be assigned directly in the SPMVHandle as they are public members. + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; + bool force_static_schedule = false; + bool force_dynamic_schedule = false; + KokkosSparse::Experimental::Bsr_TC_Precision bsr_tc_precision = + KokkosSparse::Experimental::Bsr_TC_Precision::Automatic; +}; +} // namespace Impl + +// clang-format off +/// \class SPMVHandle +/// \brief Opaque handle type for KokkosSparse::spmv. It passes the choice of +/// algorithm to the spmv implementation, and also may store internal data which can be used to +/// speed up the spmv computation. +/// \tparam DeviceType A Kokkos::Device or execution space where the spmv computation will be run. +/// Does not necessarily need to match AMatrix's device type, but its execution space needs to be able +/// to access the memory spaces of AMatrix, XVector and YVector. +/// \tparam AMatrix A specialization of KokkosSparse::CrsMatrix or +/// KokkosSparse::BsrMatrix. +/// +/// SPMVHandle's internal resources are lazily allocated and initialized by the first +/// spmv call. 
+/// +/// SPMVHandle automatically cleans up all allocated resources when it is destructed. +/// No fencing by the user is required between the final spmv and cleanup. +/// +/// A SPMVHandle instance can be used in any number of calls, with any execution space +/// instance and any X/Y vectors (with matching types) each call. +/// +/// \warning However, all calls to spmv with a given instance of SPMVHandle must use the +/// same matrix. +// clang-format on + +template +struct SPMVHandle + : public Impl::SPMVHandleImpl { + using ImplType = + Impl::SPMVHandleImpl; + // Note: these typedef names cannot shadow template parameters + using AMatrixType = AMatrix; + using XVectorType = XVector; + using YVectorType = YVector; + using ExecutionSpaceType = typename DeviceType::execution_space; + // Check all template parameters for compatibility with each other + // NOTE: we do not require that ExecutionSpace matches + // AMatrix::execution_space. For example, if the matrix's device is it is allowed to run spmv on Serial. 
+ static_assert(is_crs_matrix_v || + Experimental::is_bsr_matrix_v, + "SPMVHandle: AMatrix must be a specialization of CrsMatrix or " + "BsrMatrix."); + static_assert(Kokkos::is_view::value, + "SPMVHandle: XVector must be a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "SPMVHandle: YVector must be a Kokkos::View."); + static_assert(XVector::rank() == YVector::rank(), + "SPMVHandle: ranks of XVector and YVector must match."); + static_assert( + XVector::rank() == size_t(1) || YVector::rank() == size_t(2), + "SPMVHandle: XVector and YVector must be both rank-1 or both rank-2."); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "SPMVHandle: AMatrix must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "SPMVHandle: XVector must be accessible from ExecutionSpace"); + static_assert( + Kokkos::SpaceAccessibility::accessible, + "SPMVHandle: YVector must be accessible from ExecutionSpace"); + + // Prevent copying (this object does not support reference counting) + SPMVHandle(const SPMVHandle&) = delete; + SPMVHandle& operator=(const SPMVHandle&) = delete; + + /// \brief Create a new SPMVHandle using the given algorithm. 
+ SPMVHandle(SPMVAlgorithm algo_ = SPMV_DEFAULT) : ImplType(algo_) { + // Validate the choice of algorithm based on A's type + if constexpr (is_crs_matrix_v) { + switch (get_algorithm()) { + case SPMV_BSR_V41: + case SPMV_BSR_V42: + case SPMV_BSR_TC: + throw std::invalid_argument(std::string("SPMVHandle: algorithm ") + + get_spmv_algorithm_name(get_algorithm()) + + " cannot be used if A is a CrsMatrix"); + default:; + } + } else { + switch (get_algorithm()) { + case SPMV_MERGE_PATH: + throw std::invalid_argument(std::string("SPMVHandle: algorithm ") + + get_spmv_algorithm_name(get_algorithm()) + + " cannot be used if A is a BsrMatrix"); + default:; + } + } + } + + /// Get the SPMVAlgorithm used by this handle + SPMVAlgorithm get_algorithm() const { + // Note: get_algorithm is also a method of parent ImplType, but for + // documentation purposes it should appear directly in the public interface + // of SPMVHandle + return this->algo; + } + + /// Get pointer to this as the impl type + ImplType* get_impl() { return static_cast(this); } +}; + +namespace Impl { +template +struct is_spmv_handle : public std::false_type {}; +template +struct is_spmv_handle> : public std::true_type {}; +template +struct is_spmv_handle> : public std::true_type {}; + +template +inline constexpr bool is_spmv_handle_v = is_spmv_handle::value; +} // namespace Impl + +} // namespace KokkosSparse + +#endif diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv.hpp index 859918c58d0a..1fef3e9f1b25 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv.hpp @@ -40,10 +40,23 @@ namespace Experimental { std::is_same::type, \ typename std::remove_const::type>::value -template -void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries) { +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam 
ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ +template +void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; @@ -94,8 +107,9 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, Entries_Internal entries_i = entries; KokkosSparse::Impl::SPTRSV_SYMBOLIC< - const_handle_type, RowMap_Internal, - Entries_Internal>::sptrsv_symbolic(&tmp_handle, rowmap_i, entries_i); + ExecutionSpace, const_handle_type, RowMap_Internal, + Entries_Internal>::sptrsv_symbolic(space, &tmp_handle, rowmap_i, + entries_i); #ifdef KK_TRISOLVE_TIMERS std::cout << " > sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -103,14 +117,54 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ template + typename lno_nnz_view_t_> void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + lno_nnz_view_t_ entries) { + using ExecutionSpace = typename 
KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries); +} + +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ +template +void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; + static_assert( + std::is_same_v, + "sptrsv_symbolic: ExecutionSpace and HandleExecSpace need to match!"); + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( typename lno_row_view_t_::non_const_value_type, size_type), "sptrsv_symbolic: A size_type must match KernelHandle " @@ -140,50 +194,60 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, const_handle_type; const_handle_type tmp_handle(*handle); - typedef Kokkos::View< - typename lno_row_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout< - lno_row_view_t_>::array_layout, - typename lno_row_view_t_::device_type, - Kokkos::MemoryTraits > - RowMap_Internal; - - typedef Kokkos::View< - typename lno_nnz_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout< - lno_nnz_view_t_>::array_layout, - typename lno_nnz_view_t_::device_type, - Kokkos::MemoryTraits > - Entries_Internal; - - typedef Kokkos::View< - 
typename scalar_nnz_view_t_::const_value_type *, - typename KokkosKernels::Impl::GetUnifiedLayout< - scalar_nnz_view_t_>::array_layout, - typename scalar_nnz_view_t_::device_type, - Kokkos::MemoryTraits > - Values_Internal; - #ifdef KK_TRISOLVE_TIMERS Kokkos::Timer timer_sptrsv; #endif auto sptrsv_handle = handle->get_sptrsv_handle(); if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { - RowMap_Internal rowmap_i = rowmap; - Entries_Internal entries_i = entries; - Values_Internal values_i = values; - - typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; - sptrsvHandleType *sh = handle->get_sptrsv_handle(); - auto nrows = sh->get_nrows(); - - KokkosSparse::Impl::sptrsvcuSPARSE_symbolic< - sptrsvHandleType, RowMap_Internal, Entries_Internal, Values_Internal>( - sh, nrows, rowmap_i, entries_i, values_i, false); - +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if constexpr (std::is_same_v) { + using RowMap_Internal = Kokkos::View< + typename lno_row_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_row_view_t_>::array_layout, + typename lno_row_view_t_::device_type, + Kokkos::MemoryTraits >; + + using Entries_Internal = Kokkos::View< + typename lno_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + lno_nnz_view_t_>::array_layout, + typename lno_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; + + using Values_Internal = Kokkos::View< + typename scalar_nnz_view_t_::const_value_type *, + typename KokkosKernels::Impl::GetUnifiedLayout< + scalar_nnz_view_t_>::array_layout, + typename scalar_nnz_view_t_::device_type, + Kokkos::MemoryTraits >; + + RowMap_Internal rowmap_i = rowmap; + Entries_Internal entries_i = entries; + Values_Internal values_i = values; + + typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; + sptrsvHandleType *sh = handle->get_sptrsv_handle(); + auto nrows = sh->get_nrows(); + + 
KokkosSparse::Impl::sptrsvcuSPARSE_symbolic< + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal>(space, sh, nrows, rowmap_i, entries_i, values_i, + false); + } else { + (void)values; + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, + entries); + } + +#else // We better go to the native implementation + (void)values; + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); +#endif } else { - KokkosSparse::Experimental::sptrsv_symbolic(handle, rowmap, entries); + (void)values; + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); } #ifdef KK_TRISOLVE_TIMERS std::cout << " + sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -191,16 +255,61 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ template -void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, - XType x) { + typename lno_nnz_view_t_, typename scalar_nnz_view_t_> +void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + + sptrsv_symbolic(my_exec_space, handle, rowmap, entries, values); +} + +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization 
of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param space The execution space instance this kernel will be run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, BType b, XType x) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; + static_assert( + std::is_same_v, + "sptrsv solve: ExecutionSpace and HandleExecSpace need to match"); + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE( typename lno_row_view_t_::non_const_value_type, size_type), "sptrsv_solve: A size_type must match KernelHandle size_type " @@ -301,29 +410,84 @@ void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, auto sptrsv_handle = handle->get_sptrsv_handle(); if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { - typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; - sptrsvHandleType *sh = handle->get_sptrsv_handle(); - auto nrows = sh->get_nrows(); - - KokkosSparse::Impl::sptrsvcuSPARSE_solve( - sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); - +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if constexpr (std::is_same_v) { + typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; + sptrsvHandleType *sh = handle->get_sptrsv_handle(); + auto nrows = sh->get_nrows(); + + 
KokkosSparse::Impl::sptrsvcuSPARSE_solve< + ExecutionSpace, sptrsvHandleType, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, XType_Internal>( + space, sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); + } else { + KokkosSparse::Impl::SPTRSV_SOLVE< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(space, &tmp_handle, rowmap_i, entries_i, + values_i, b_i, x_i); + } +#else + KokkosSparse::Impl::SPTRSV_SOLVE< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(space, &tmp_handle, rowmap_i, entries_i, + values_i, b_i, x_i); +#endif } else { KokkosSparse::Impl::SPTRSV_SOLVE< - typename scalar_nnz_view_t_::execution_space, const_handle_type, - RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, - XType_Internal>::sptrsv_solve(&tmp_handle, rowmap_i, entries_i, + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve(space, &tmp_handle, rowmap_i, entries_i, values_i, b_i, x_i); } } // sptrsv_solve -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) -// --------------------------------------------------------------------- -template -void sptrsv_solve(KernelHandle *handle, XType x, XType b) { +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's 
(A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, + XType x) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handle, rowmap, entries, values, b, x); +} + +#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) || defined(DOXY) +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param space The execution space instance this kernel will run on + * @param handle KernelHandle instance + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, XType x, + XType b) { auto crsmat = handle->get_sptrsv_handle()->get_crsmat(); auto values = crsmat.values; auto graph = crsmat.graph; @@ -341,31 +505,79 @@ void sptrsv_solve(KernelHandle *handle, XType x, XType b) { if (handle->is_sptrsv_lower_tri()) { // apply forward pivoting - Kokkos::deep_copy(x, b); + Kokkos::deep_copy(space, x, b); // the fifth argument (i.e., first x) is not used - sptrsv_solve(handle, row_map, entries, values, x, x); + sptrsv_solve(space, handle, row_map, entries, values, x, x); } else { // the fifth argument (i.e., first x) is not used - sptrsv_solve(handle, row_map, entries, values, b, b); + sptrsv_solve(space, handle, row_map, entries, values, b, b); // apply backward pivoting - Kokkos::deep_copy(x, b); + Kokkos::deep_copy(space, x, b); } } -// --------------------------------------------------------------------- +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * 
KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param handle KernelHandle instance + * @param x The x vector + * @param b The b vector + */ template -void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, - XType b) { +void sptrsv_solve(KernelHandle *handle, XType x, XType b) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handle, x, b); +} + +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param space The execution space instance this kernel will run on + * @param handleL KernelHandle instance for lower triangular matrix + * @param handleU KernelHandle instance for upper triangular matrix + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handleL, + KernelHandle *handleU, XType x, XType b) { // Lower-triangular solve - sptrsv_solve(handleL, x, b); + sptrsv_solve(space, handleL, x, b); // copy the solution to rhs - Kokkos::deep_copy(b, x); + Kokkos::deep_copy(space, b, x); // uper-triangular solve - sptrsv_solve(handleU, x, b); + sptrsv_solve(space, handleU, x, b); +} + +/** + * @brief Supernodal sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam XType The x and b vector type + * @param handleL KernelHandle instance for lower triangular matrix + * @param handleU KernelHandle instance for upper triangular matrix + * @param x The x vector + * @param b The b vector + */ +template +void sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, + XType b) { + using ExecutionSpace = typename 
KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handleL, handleU, x, b); } #endif @@ -569,13 +781,21 @@ void sptrsv_solve_streams(const std::vector &execspace_v, if (handle_v[0]->get_sptrsv_handle()->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // NOTE: assume all streams use the same SPTRSV_CUSPARSE algo. KokkosSparse::Impl::sptrsvcuSPARSE_solve_streams< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, XType_Internal>( execspace_v, handle_i_v, rowmap_i_v, entries_i_v, values_i_v, b_i_v, x_i_v, false); - +#else + KokkosSparse::Impl::SPTRSV_SOLVE< + ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, + Values_Internal, BType_Internal, + XType_Internal>::sptrsv_solve_streams(execspace_v, handle_i_v, + rowmap_i_v, entries_i_v, + values_i_v, b_i_v, x_i_v); +#endif } else { KokkosSparse::Impl::SPTRSV_SOLVE< ExecutionSpace, const_handle_type, RowMap_Internal, Entries_Internal, diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_handle.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_handle.hpp index 7c9027d24ac8..cf23bfdc1f7b 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_handle.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_sptrsv_handle.hpp @@ -476,6 +476,22 @@ class SPTRSVHandle { this->set_if_algm_require_symb_lvlsched(); this->set_if_algm_require_symb_chain(); + // Check a few prerequisites before allowing users + // to run with the cusparse implementation of sptrsv. 
+ if (algm == SPTRSVAlgorithm::SPTRSV_CUSPARSE) { +#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + throw( + std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but " + "cuSPARSE TPL not enabled.")); +#else + if (!std::is_same_v) { + throw( + std::runtime_error("sptrsv handle: SPTRSV_CUSPARSE requested but " + "HandleExecSpace is not Kokkos::CUDA.")); + } +#endif + } + #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV if (lower_tri) { // lower-triangular is stored in CSC diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_trsv.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_trsv.hpp index 1363542f1b49..9b25811d1019 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_trsv.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_trsv.hpp @@ -68,11 +68,15 @@ void trsv(const char uplo[], const char trans[], const char diag[], typename XMV::non_const_value_type>::value, "KokkosBlas::trsv: The output x must be nonconst."); + static_assert(KokkosSparse::is_crs_matrix::value || + KokkosSparse::Experimental::is_bsr_matrix::value, + "KokkosBlas::trsv: A is not a CRS or BSR matrix."); + // The following three code lines have been moved up by Massimiliano Lupo // Pasini typedef typename BMV::size_type size_type; - const size_type numRows = static_cast(A.numRows()); - const size_type numCols = static_cast(A.numCols()); + const size_type numRows = static_cast(A.numPointRows()); + const size_type numCols = static_cast(A.numPointCols()); const size_type zero = static_cast(0); if (zero != numRows && uplo[0] != 'U' && uplo[0] != 'u' && uplo[0] != 'L' && @@ -117,13 +121,21 @@ void trsv(const char uplo[], const char trans[], const char diag[], KokkosKernels::Impl::throw_runtime_exception(os.str()); } - typedef KokkosSparse::CrsMatrix< + using AMatrix_Bsr_Internal = KokkosSparse::Experimental::BsrMatrix< typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, typename AMatrix::device_type, Kokkos::MemoryTraits, - typename 
AMatrix::const_size_type> - AMatrix_Internal; - - AMatrix_Internal A_i = A; + typename AMatrix::const_size_type>; + + using AMatrix_Internal = std::conditional_t< + KokkosSparse::is_crs_matrix::value, + KokkosSparse::CrsMatrix, + typename AMatrix::const_size_type>, + AMatrix_Bsr_Internal>; + + AMatrix_Internal A_i(A); typedef Kokkos::View< typename BMV::const_value_type**, typename BMV::array_layout, diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp new file mode 100644 index 000000000000..0952654bdf04 --- /dev/null +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp @@ -0,0 +1,282 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + TOKEN, KOKKOS_SCALAR_TYPE, TPL_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_NUMERIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spadd_numeric( \ + const EXEC_SPACE_TYPE &exec, kernelhandle_t *handle, ORDINAL_TYPE m, \ + ORDINAL_TYPE n, const KOKKOS_SCALAR_TYPE alpha, rowmap_view_t rowmapA, \ + 
colidx_view_t colidxA, scalar_view_t valuesA, \ + const KOKKOS_SCALAR_TYPE beta, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, scalar_view_t valuesB, rowmap_view_t rowmapC, \ + non_const_colidx_view_t colidxC, non_const_scalar_view_t valuesC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_numeric[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto &cuspData = addHandle->cusparseData; \ + auto &cuspHandle = \ + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; \ + cusparsePointerMode_t oldPtrMode; \ + \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseSetStream(cuspHandle, exec.cuda_stream())); \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseGetPointerMode(cuspHandle, &oldPtrMode)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetPointerMode( \ + cuspHandle, CUSPARSE_POINTER_MODE_HOST)); /* alpha, beta on host*/ \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparse##TOKEN##csrgeam2( \ + cuspHandle, m, n, reinterpret_cast(&alpha), \ + cuspData.descrA, nnzA, \ + reinterpret_cast(valuesA.data()), \ + rowmapA.data(), colidxA.data(), \ + reinterpret_cast(&beta), cuspData.descrB, \ + nnzB, reinterpret_cast(valuesB.data()), \ + rowmapB.data(), colidxB.data(), cuspData.descrC, \ + reinterpret_cast(valuesC.data()), \ + const_cast(rowmapC.data()), colidxC.data(), \ + cuspData.workspace)); \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseSetPointerMode(cuspHandle, oldPtrMode)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(cuspHandle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + S, float, float, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + D, double, double, int, int, 
Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + C, Kokkos::complex, cuComplex, int, int, Kokkos::LayoutLeft, \ + Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + Z, Kokkos::complex, cuDoubleComplex, int, int, \ + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + TOKEN, KOKKOS_SCALAR_TYPE, TPL_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_NUMERIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using 
scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spadd_numeric( \ + const EXEC_SPACE_TYPE &exec, kernelhandle_t *handle, ORDINAL_TYPE m, \ + ORDINAL_TYPE n, const KOKKOS_SCALAR_TYPE alpha, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, scalar_view_t valuesA, \ + const KOKKOS_SCALAR_TYPE beta, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, scalar_view_t valuesB, rowmap_view_t rowmapC, \ + non_const_colidx_view_t colidxC, non_const_scalar_view_t valuesC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_numeric[TPL_ROCSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto &rocData = addHandle->rocsparseData; \ + auto &rocspHandle = KokkosKernels::Impl::RocsparseSingleton::singleton() \ + .rocsparseHandle; \ + rocsparse_pointer_mode oldPtrMode; \ + \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_get_pointer_mode(rocspHandle, &oldPtrMode)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode( \ + rocspHandle, rocsparse_pointer_mode_host)); /* alpha, beta on host*/ \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_##TOKEN##csrgeam( \ + rocspHandle, m, n, \ + reinterpret_cast(&alpha), rocData.descrA, \ + nnzA, reinterpret_cast(valuesA.data()), \ + rowmapA.data(), colidxA.data(), \ + reinterpret_cast(&beta), rocData.descrB, \ + nnzB, reinterpret_cast(valuesB.data()), \ + rowmapB.data(), colidxB.data(), rocData.descrC, \ + reinterpret_cast(valuesC.data()), \ + const_cast(rowmapC.data()), colidxC.data())); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_pointer_mode(rocspHandle, oldPtrMode)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, NULL)); \ + \ + 
Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE_EXT(ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + s, float, float, int, int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + d, double, double, int, int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + c, Kokkos::complex, rocsparse_float_complex, int, int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + z, Kokkos::complex, rocsparse_double_complex, int, int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE_EXT(false) +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp new file mode 100644 index 000000000000..fe6b51207f7a --- /dev/null +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp @@ -0,0 +1,238 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + TOKEN, KOKKOS_SCALAR_TYPE, TPL_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_SYMBOLIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + static void spadd_symbolic(const EXEC_SPACE_TYPE& exec, \ + kernelhandle_t* handle, const ORDINAL_TYPE m, \ + const ORDINAL_TYPE n, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, \ + non_const_rowmap_view_t rowmapC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_symbolic[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto& cuspData = addHandle->cusparseData; \ + auto& cuspHandle = \ + 
KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; \ + \ + /* Not easy to init 'one' for cuda complex, so we don't init it. Anyway, \ + * the uninit'ed var won't affect C's pattern. \ + */ \ + TPL_SCALAR_TYPE one; \ + size_t nbytes; \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + OFFSET_TYPE nnzC = 0; \ + \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseSetStream(cuspHandle, exec.cuda_stream())); \ + \ + /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparsecreatematdescr \ + It sets the fields MatrixType and IndexBase to the default values \ + CUSPARSE_MATRIX_TYPE_GENERAL and CUSPARSE_INDEX_BASE_ZERO, \ + respectively, while leaving other fields uninitialized. */ \ + \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&cuspData.descrA)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&cuspData.descrB)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&cuspData.descrC)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparse##TOKEN##csrgeam2_bufferSizeExt( \ + cuspHandle, m, n, &one, cuspData.descrA, nnzA, NULL, rowmapA.data(), \ + colidxA.data(), &one, cuspData.descrB, nnzB, NULL, rowmapB.data(), \ + colidxB.data(), cuspData.descrC, NULL, rowmapC.data(), NULL, \ + &nbytes)); \ + cuspData.nbytes = nbytes; \ + cuspData.workspace = Kokkos::kokkos_malloc(nbytes); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgeam2Nnz( \ + cuspHandle, m, n, cuspData.descrA, nnzA, rowmapA.data(), \ + colidxA.data(), cuspData.descrB, nnzB, rowmapB.data(), \ + colidxB.data(), cuspData.descrC, rowmapC.data(), &nnzC, \ + cuspData.workspace)); \ + addHandle->set_c_nnz(nnzC); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(cuspHandle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + S, float, float, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + 
KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + D, double, double, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + C, Kokkos::complex, cuComplex, int, int, Kokkos::LayoutLeft, \ + Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + Z, Kokkos::complex, cuDoubleComplex, int, int, \ + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + KOKKOS_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_SYMBOLIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + static void spadd_symbolic(const EXEC_SPACE_TYPE& exec, \ + kernelhandle_t* handle, const ORDINAL_TYPE m, \ + const ORDINAL_TYPE n, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, rowmap_view_t 
rowmapB, \ + colidx_view_t colidxB, \ + non_const_rowmap_view_t rowmapC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_symbolic[TPL_ROCSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto& rocData = addHandle->rocsparseData; \ + auto& rocspHandle = KokkosKernels::Impl::RocsparseSingleton::singleton() \ + .rocsparseHandle; \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + OFFSET_TYPE nnzC = 0; \ + \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_create_mat_descr(&rocData.descrA)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_create_mat_descr(&rocData.descrB)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_create_mat_descr(&rocData.descrC)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgeam_nnz( \ + rocspHandle, m, n, rocData.descrA, nnzA, rowmapA.data(), \ + colidxA.data(), rocData.descrB, nnzB, rowmapB.data(), \ + colidxB.data(), rocData.descrC, rowmapC.data(), &nnzC)); \ + addHandle->set_c_nnz(nnzC); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE_EXT( \ + ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + float, rocsparse_int, rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + double, rocsparse_int, rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + Kokkos::complex, rocsparse_int, rocsparse_int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + Kokkos::complex, rocsparse_int, rocsparse_int, \ + 
Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE_EXT(false) +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp index b654c4331c45..6d4db8731fcd 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp @@ -21,20 +21,125 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists // -template +template struct spadd_symbolic_tpl_spec_avail { enum : bool { value = false }; }; -template +template struct spadd_numeric_tpl_spec_avail { enum : bool { value = false }; }; +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_symbolic_tpl_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_numeric_tpl_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, 
MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#define KOKKOSSPARSE_SPADD_TPL_SPEC_AVAIL( \ + ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL(float, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL(double, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL(float, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL(double, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +KOKKOSSPARSE_SPADD_TPL_SPEC_AVAIL(int, int, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +KOKKOSSPARSE_SPADD_TPL_SPEC_AVAIL(rocsparse_int, rocsparse_int, + 
Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +#endif + } // namespace Impl } // namespace KokkosSparse diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp index e144b531628a..517e10498879 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_numeric_tpl_spec_avail.hpp @@ -82,10 +82,12 @@ struct spgemm_numeric_tpl_spec_avail { SPGEMM_NUMERIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaSpace) \ SPGEMM_NUMERIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace) +#if (CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(float) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(double) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(Kokkos::complex) SPGEMM_NUMERIC_AVAIL_CUSPARSE_S(Kokkos::complex) +#endif #endif diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp index b8c545ffe23e..41e8802214c6 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spgemm_symbolic_tpl_spec_avail.hpp @@ -67,11 +67,13 @@ struct spgemm_symbolic_tpl_spec_avail { SPGEMM_SYMBOLIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaSpace) \ SPGEMM_SYMBOLIC_AVAIL_CUSPARSE(SCALAR, Kokkos::CudaUVMSpace) +#if (CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(float) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(double) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(Kokkos::complex) SPGEMM_SYMBOLIC_AVAIL_CUSPARSE_S(Kokkos::complex) #endif +#endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #define SPGEMM_SYMBOLIC_AVAIL_ROCSPARSE(SCALAR) \ diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp 
b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index 07bb0a0f0ac1..16bf1abecfb8 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -22,10 +22,10 @@ #endif namespace KokkosSparse { -namespace Experimental { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_bsrmatrix_tpl_spec_avail { enum : bool { value = false }; }; @@ -41,6 +41,8 @@ struct spmv_bsrmatrix_tpl_spec_avail { template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ @@ -127,22 +129,24 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_bsrmatrix_tpl_spec_avail< \ - EXECSPACE, \ - ::KokkosSparse::Experimental::BsrMatrix< \ - const SCALAR, const MKL_INT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const MKL_INT>, \ - Kokkos::View< \ - const SCALAR*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const MKL_INT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const MKL_INT>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool 
{ value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -166,7 +170,8 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, #endif // Specialization struct which defines whether a specialization exists -template > struct spmv_mv_bsrmatrix_tpl_spec_avail { @@ -184,6 +189,8 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail { template <> \ struct spmv_mv_bsrmatrix_tpl_spec_avail< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ @@ -231,23 +238,25 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_mv_bsrmatrix_tpl_spec_avail< \ - EXECSPACE, \ - ::KokkosSparse::Experimental::BsrMatrix< \ - const SCALAR, const int, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const int>, \ - Kokkos::View< \ - const SCALAR*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + const SCALAR, const int, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const int>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -279,6 +288,8 @@ KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, template <> \ struct spmv_bsrmatrix_tpl_spec_avail< \ Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ 
::KokkosSparse::Experimental::BsrMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ @@ -336,7 +347,6 @@ KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, #endif // defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 75752190e7f4..9c844ff9105f 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -18,228 +18,222 @@ #define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP #include "KokkosKernels_AlwaysFalse.hpp" -#include "KokkosKernels_Controls.hpp" #include "KokkosSparse_Utils_mkl.hpp" #include "KokkosSparse_Utils_cusparse.hpp" +#include "KokkosKernels_tpl_handles_decl.hpp" -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && (__INTEL_MKL__ > 2017) #include namespace KokkosSparse { -namespace Experimental { namespace Impl { -#if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -using KokkosSparse::Impl::mode_kk_to_mkl; - -inline matrix_descr getDescription() { - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - return A_descr; -} - -inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, - MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, float* y) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, 
b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); - - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); -} - -inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const double* Avalues, - const double* x, double* y) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); - - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); -} - -inline void spmv_block_impl_mkl(sparse_operation_t op, - Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); - - MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( - op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); -} - -inline void spmv_block_impl_mkl(sparse_operation_t op, - Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, - MKL_INT n, MKL_INT b, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - sparse_matrix_t A_mkl; - 
KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); - - matrix_descr A_descr = getDescription(); - MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( - op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); -} - -inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, - float beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, MKL_INT colx, MKL_INT ldx, - float* y, MKL_INT ldy) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); - - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, - SPARSE_LAYOUT_ROW_MAJOR, x, colx, - ldx, beta, y, ldy)); -} - -inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, - double beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, - const MKL_INT* Aentries, - const double* Avalues, const double* x, - MKL_INT colx, MKL_INT ldx, double* y, - MKL_INT ldy) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), const_cast(Avalues))); - - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, - SPARSE_LAYOUT_ROW_MAJOR, x, colx, - ldx, beta, y, ldy)); -} - -inline void spm_mv_block_impl_mkl( - sparse_operation_t op, Kokkos::complex alpha, - 
Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, const Kokkos::complex* x, - MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex8*)Avalues)); - - MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; - matrix_descr A_descr = getDescription(); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_c_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, - reinterpret_cast(x), colx, ldx, - beta_mkl, reinterpret_cast(y), ldy)); +// Note: Scalar here is the Kokkos type, not the MKL type +template +inline void spmv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, + Scalar beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const Scalar* Avalues, const Scalar* x, Scalar* y) { + using MKLScalar = + typename KokkosSparse::Impl::KokkosToMKLScalar::type; + using ExecSpace = typename Handle::ExecutionSpaceType; + using Subhandle = KokkosSparse::Impl::MKL_SpMV_Data; + Subhandle* subhandle; + const MKLScalar* x_mkl = reinterpret_cast(x); + MKLScalar* y_mkl = reinterpret_cast(y); + if (handle->is_set_up) { + subhandle = dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for MKL BSR"); + } else { + // Use the default execution space instance, as classic MKL does not use + // a specific instance. 
+ subhandle = new Subhandle(ExecSpace()); + handle->tpl = subhandle; + subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL; + subhandle->descr.mode = SPARSE_FILL_MODE_FULL; + subhandle->descr.diag = SPARSE_DIAG_NON_UNIT; + // Note: the create_csr routine requires non-const values even though + // they're not actually modified + MKLScalar* Avalues_mkl = + reinterpret_cast(const_cast(Avalues)); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } + handle->is_set_up = true; + } + MKLScalar alpha_mkl = KokkosSparse::Impl::KokkosToMKLScalar(alpha); + MKLScalar beta_mkl = KokkosSparse::Impl::KokkosToMKLScalar(beta); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v>) { + 
KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } } -inline void spm_mv_block_impl_mkl( - sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, MKL_INT b, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, const Kokkos::complex* x, - MKL_INT colx, MKL_INT ldx, Kokkos::complex* y, MKL_INT ldy) { - sparse_matrix_t A_mkl; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, - const_cast(Arowptrs), const_cast(Arowptrs + 1), - const_cast(Aentries), (MKL_Complex16*)Avalues)); - - matrix_descr A_descr = getDescription(); - MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_z_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, - reinterpret_cast(x), colx, ldx, - beta_mkl, reinterpret_cast(y), ldy)); +// Note: Scalar here is the Kokkos type, not the MKL type +template +inline void spmv_mv_bsr_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, + Scalar beta, MKL_INT m, MKL_INT n, MKL_INT b, + const MKL_INT* Arowptrs, const MKL_INT* Aentries, + const Scalar* Avalues, const Scalar* x, + MKL_INT colx, MKL_INT ldx, Scalar* y, MKL_INT ldy) { + using MKLScalar = + typename KokkosSparse::Impl::KokkosToMKLScalar::type; + using ExecSpace = typename Handle::ExecutionSpaceType; + using Subhandle = KokkosSparse::Impl::MKL_SpMV_Data; + Subhandle* subhandle; + const MKLScalar* x_mkl = reinterpret_cast(x); + MKLScalar* y_mkl = reinterpret_cast(y); + if (handle->is_set_up) { + subhandle = dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not 
set up for MKL BSR"); + } else { + // Use the default execution space instance, as classic MKL does not use + // a specific instance. + subhandle = new Subhandle(ExecSpace()); + handle->tpl = subhandle; + subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL; + subhandle->descr.mode = SPARSE_FILL_MODE_FULL; + subhandle->descr.diag = SPARSE_DIAG_NON_UNIT; + // Note: the create_csr routine requires non-const values even though + // they're not actually modified + MKLScalar* Avalues_mkl = + reinterpret_cast(const_cast(Avalues)); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, + n, b, const_cast(Arowptrs), + const_cast(Arowptrs + 1), const_cast(Aentries), + Avalues_mkl)); + } + handle->is_set_up = true; + } + MKLScalar alpha_mkl = KokkosSparse::Impl::KokkosToMKLScalar(alpha); + MKLScalar beta_mkl = KokkosSparse::Impl::KokkosToMKLScalar(beta); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm( + op, alpha_mkl, subhandle->mat, subhandle->descr, + SPARSE_LAYOUT_ROW_MAJOR, x_mkl, colx, ldx, beta_mkl, y_mkl, ldy)); + } else if constexpr (std::is_same_v) { + 
KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm( + op, alpha_mkl, subhandle->mat, subhandle->descr, + SPARSE_LAYOUT_ROW_MAJOR, x_mkl, colx, ldx, beta_mkl, y_mkl, ldy)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mm( + op, alpha_mkl, subhandle->mat, subhandle->descr, + SPARSE_LAYOUT_ROW_MAJOR, x_mkl, colx, ldx, beta_mkl, y_mkl, ldy)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mm( + op, alpha_mkl, subhandle->mat, subhandle->descr, + SPARSE_LAYOUT_ROW_MAJOR, x_mkl, colx, ldx, beta_mkl, y_mkl, ldy)); + } } -#endif - -#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ - template <> \ - struct SPMV_BSRMATRIX< \ - EXECSPACE, \ - ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, MKL_INT const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, MKL_INT const>, \ - Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, MKL_INT const, device_type, \ - Kokkos::MemoryTraits, MKL_INT const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View>; \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv_bsrmatrix( \ - const EXECSPACE&, \ - const KokkosKernels::Experimental::Controls& /*controls*/, \ - const char mode[], const coefficient_type& alpha, const AMatrix& A, \ - const XVector& X, const coefficient_type& beta, const YVector& Y) { \ - std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ - A.numCols(), A.blockDim(), A.graph.row_map.data(), \ - 
A.graph.entries.data(), A.values.data(), X.data(), \ - Y.data()); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, MKL_INT const>, \ + Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, device_type, \ + Kokkos::MemoryTraits, MKL_INT const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_bsrmatrix(const EXECSPACE&, Handle* handle, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& X, \ + const coefficient_type& beta, \ + const YVector& Y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_bsr_mkl(handle, mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ + A.numCols(), A.blockDim(), A.graph.row_map.data(), \ + A.graph.entries.data(), A.values.data(), X.data(), \ + Y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -268,6 +262,8 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, template <> \ struct SPMV_MV_BSRMATRIX< \ EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ SCALAR const, MKL_INT const, \ Kokkos::Device, \ @@ -281,9 +277,12 @@ 
KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, Kokkos::MemoryTraits>, \ true, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, MKL_INT const, device_type, \ - Kokkos::MemoryTraits, MKL_INT const>; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, MKL_INT const, device_type, \ + Kokkos::MemoryTraits, MKL_INT const>; \ using XVector = Kokkos::View< \ SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ @@ -291,21 +290,22 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, Kokkos::MemoryTraits>; \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_mv_bsrmatrix( \ - const EXECSPACE&, \ - const KokkosKernels::Experimental::Controls& /*controls*/, \ - const char mode[], const coefficient_type& alpha, const AMatrix& A, \ - const XVector& X, const coefficient_type& beta, const YVector& Y) { \ + static void spmv_mv_bsrmatrix(const EXECSPACE&, Handle* handle, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& X, \ + const coefficient_type& beta, \ + const YVector& Y) { \ std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ MKL_INT colx = static_cast(X.extent(1)); \ MKL_INT ldx = static_cast(X.stride_1()); \ MKL_INT ldy = static_cast(Y.stride_1()); \ - spm_mv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ - A.numCols(), A.blockDim(), A.graph.row_map.data(), \ - A.graph.entries.data(), A.values.data(), X.data(), \ - colx, ldx, Y.data(), ldy); \ + spmv_mv_bsr_mkl(handle, mode_kk_to_mkl(mode[0]), alpha, beta, \ + A.numRows(), A.numCols(), A.blockDim(), \ + A.graph.row_map.data(), A.graph.entries.data(), \ + A.values.data(), X.data(), colx, ldx, Y.data(), ldy); \ 
Kokkos::Profiling::popRegion(); \ } \ }; @@ -335,15 +335,13 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, #undef KOKKOSSPARSE_SPMV_MV_MKL } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse -#endif // KOKKOSKERNELS_ENABLE_TPL_MKL +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && (__INTEL_MKL__ > 2017) // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include "cusparse.h" -#include "KokkosSparse_Utils_cusparse.hpp" // // From https://docs.nvidia.com/cuda/cusparse/index.html#bsrmv @@ -352,25 +350,29 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, // - Only CUSPARSE_OPERATION_NON_TRANSPOSE is supported // - Only CUSPARSE_MATRIX_TYPE_GENERAL is supported. // +#if (9000 <= CUDA_VERSION) + +#include "KokkosSparse_Utils_cusparse.hpp" + namespace KokkosSparse { -namespace Experimental { namespace Impl { -template -void spmv_block_impl_cusparse( - const Kokkos::Cuda& exec, - const KokkosKernels::Experimental::Controls& controls, const char mode[], - typename YVector::non_const_value_type const& alpha, const AMatrix& A, - const XVector& x, typename YVector::non_const_value_type const& beta, - const YVector& y) { +template +void spmv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, + const char mode[], + typename YVector::non_const_value_type const& alpha, + const AMatrix& A, const XVector& x, + typename YVector::non_const_value_type const& beta, + const YVector& y) { using offset_type = typename AMatrix::non_const_size_type; using entry_type = typename AMatrix::non_const_ordinal_type; using value_type = typename AMatrix::non_const_value_type; /* initialize cusparse library */ - cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + cusparseHandle_t cusparseHandle = + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ - KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); + 
KokkosSparse::Impl::TemporarySetCusparseStream tscs(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; @@ -382,70 +384,75 @@ void spmv_block_impl_cusparse( } } -#if (9000 <= CUDA_VERSION) + KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle; + + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); + } else { + /* create and set the subhandle and matrix descriptor */ + subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); + handle->tpl = subhandle; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO)); + handle->is_set_up = true; + } - /* create and set the matrix descriptor */ - cusparseMatDescr_t descrA = 0; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; /* perform the actual SpMV operation */ - if ((std::is_same::value) && - (std::is_same::value)) { - if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmv( - cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmv( - cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - 
reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmv( - cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmv( - cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else { - throw std::logic_error( - "Trying to call cusparse[*]bsrmv with a scalar type not " - "float/double, " - "nor complex of either!"); - } + static_assert( + std::is_same_v && std::is_same_v, + "With cuSPARSE non-generic API, offset and entry types must both be int. 
" + "Something wrong with TPL avail logic."); + if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), A.graph.row_map.data(), + A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), + subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); } else { - throw std::logic_error( - "With cuSPARSE pre-10.0, offset and entry types must be int. 
" - "Something wrong with TPL avail logic."); + static_assert(KokkosKernels::Impl::always_false_v, + "Trying to call cusparse[*]bsrmv with a scalar type not " + "float/double, nor complex of either!"); } - - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); -#endif // (9000 <= CUDA_VERSION) } // Reference @@ -463,29 +470,24 @@ void spmv_block_impl_cusparse( // -> C = t(t(B)) * t(A) + C // -> C = B * t(A) + C // This is impossible in cuSparse without explicitly transposing A, -// so we just do not support LayoutRight in cuSparse TPL now -// -template < - class AMatrix, class XVector, class YVector, - std::enable_if_t::value && - std::is_same::value, - bool> = true> -void spm_mv_block_impl_cusparse( - const Kokkos::Cuda& exec, - const KokkosKernels::Experimental::Controls& controls, const char mode[], - typename YVector::non_const_value_type const& alpha, const AMatrix& A, - const XVector& x, typename YVector::non_const_value_type const& beta, - const YVector& y) { +// so we just do not support LayoutRight in cuSparse TPL now (this is +// statically asserted here) +template +void spmv_mv_bsr_cusparse(const Kokkos::Cuda& exec, Handle* handle, + const char mode[], + typename YVector::non_const_value_type const& alpha, + const AMatrix& A, const XVector& x, + typename YVector::non_const_value_type const& beta, + const YVector& y) { using offset_type = typename AMatrix::non_const_size_type; using entry_type = typename AMatrix::non_const_ordinal_type; using value_type = typename AMatrix::non_const_value_type; /* initialize cusparse library */ - cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + cusparseHandle_t cusparseHandle = + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ - KokkosSparse::Impl::TemporarySetCusparseStream(cusparseHandle, exec); + KokkosSparse::Impl::TemporarySetCusparseStream tscs(cusparseHandle, exec); /* Set the operation mode */ 
cusparseOperation_t myCusparseOperation; @@ -499,123 +501,136 @@ void spm_mv_block_impl_cusparse( int colx = static_cast(x.extent(1)); - // ldx and ldy should be the leading dimension of X,Y respectively - const int ldx = static_cast(x.extent(0)); - const int ldy = static_cast(y.extent(0)); + // ldx and ldy should be the leading dimension (stride between columns) of X,Y + // respectively + const int ldx = static_cast(x.stride(1)); + const int ldy = static_cast(y.stride(1)); -#if (9000 <= CUDA_VERSION) - - /* create and set the matrix descriptor */ - cusparseMatDescr_t descrA = 0; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + static_assert( + std::is_same_v && + std::is_same_v, + "cuSPARSE requires both X and Y to be LayoutLeft."); + + KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle; + + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); + } else { + /* create and set the subhandle and matrix descriptor */ + subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); + handle->tpl = subhandle; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO)); + handle->is_set_up = true; + } cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; /* perform the actual SpMV operation */ - if ((std::is_same::value) && - (std::is_same::value)) { - if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmm( - cusparseHandle, dirA, myCusparseOperation, - CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), - A.nnz(), 
reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), ldx, - reinterpret_cast(&beta), - reinterpret_cast(y.data()), ldy)); - } else if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmm( - cusparseHandle, dirA, myCusparseOperation, - CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), ldx, - reinterpret_cast(&beta), - reinterpret_cast(y.data()), ldy)); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmm( - cusparseHandle, dirA, myCusparseOperation, - CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), ldx, - reinterpret_cast(&beta), - reinterpret_cast(y.data()), ldy)); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmm( - cusparseHandle, dirA, myCusparseOperation, - CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), - reinterpret_cast(x.data()), ldx, - reinterpret_cast(&beta), - reinterpret_cast(y.data()), ldy)); - } else { - throw std::logic_error( - "Trying to call cusparse[*]bsrmm with a scalar type not " - "float/double, " - "nor complex of either!"); - } + static_assert( + std::is_same_v && std::is_same_v, + "With cuSPARSE non-generic API, offset and entry types must both be int. 
" + "Something wrong with TPL avail logic."); + if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), A.graph.row_map.data(), + A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), + subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); } else { - throw std::logic_error( - "With cuSPARSE pre-10.0, offset and entry types must be int. 
" - "Something wrong with TPL avail logic."); + static_assert(KokkosKernels::Impl::always_false_v, + "Trying to call cusparse[*]bsrmm with a scalar type not " + "float/double, nor complex of either!"); } - - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); -#endif // (9000 <= CUDA_VERSION) } -#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ - template <> \ - struct SPMV_BSRMATRIX< \ - Kokkos::Cuda, \ - ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const>, \ - Kokkos::View< \ - SCALAR const*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL const, device_type, memory_trait_type, \ - OFFSET const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, LAYOUT, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv_bsrmatrix(const Kokkos::Cuda& exec, \ - const Controls& controls, const char mode[], \ - const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, \ - const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + 
::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_bsrmatrix(const Kokkos::Cuda& exec, Handle* handle, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_bsr_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, Kokkos::CudaSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) @@ -664,57 +679,59 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif // (9000 <= CUDA_VERSION) #undef KOKKOSSPARSE_SPMV_CUSPARSE // cuSparse TPL does not support LayoutRight for this operation // only specialize for LayoutLeft -#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, SPACE, \ - ETI_AVAIL) \ - template <> \ - struct SPMV_MV_BSRMATRIX< \ - Kokkos::Cuda, \ - 
::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - Kokkos::MemoryTraits, OFFSET const>, \ - Kokkos::View< \ - SCALAR const**, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - false, true, ETI_AVAIL> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL const, device_type, memory_trait_type, \ - OFFSET const>; \ - using XVector = Kokkos::View< \ - SCALAR const**, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View \ + struct SPMV_MV_BSRMATRIX< \ + Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const**, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true, ETI_AVAIL> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ + using XVector = Kokkos::View< \ + SCALAR const**, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv_mv_bsrmatrix(const Kokkos::Cuda& exec, \ - const Controls& controls, const char mode[], \ - const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, \ - const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ - 
Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spm_mv_block_impl_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_mv_bsrmatrix(const Kokkos::Cuda& exec, Handle* handle, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mv_bsr_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, true) @@ -740,13 +757,11 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, Kokkos::CudaUVMSpace, false) -#endif // (9000 <= CUDA_VERSION) - #undef KOKKOSSPARSE_SPMV_MV_CUSPARSE } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse +#endif // (9000 <= CUDA_VERSION) #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -760,16 +775,15 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, #include "KokkosSparse_Utils_rocsparse.hpp" namespace KokkosSparse { -namespace Experimental { namespace Impl { -template -void spmv_block_impl_rocsparse( - const Kokkos::HIP& exec, - const KokkosKernels::Experimental::Controls& controls, const char mode[], - typename YVector::non_const_value_type const& alpha, const AMatrix& A, - const XVector& x, typename YVector::non_const_value_type const& beta, - const YVector& y) { +template +void spmv_bsr_rocsparse(const Kokkos::HIP& exec, Handle* handle, + const 
char mode[], + typename YVector::non_const_value_type const& alpha, + const AMatrix& A, const XVector& x, + typename YVector::non_const_value_type const& beta, + const YVector& y) { /* rocm 5.4.0 rocsparse_*bsrmv reference: https://rocsparse.readthedocs.io/en/rocm-5.4.0/usermanual.html#rocsparse-bsrmv-ex @@ -818,9 +832,10 @@ void spmv_block_impl_rocsparse( Kokkos::LayoutStride>, "A entries must be contiguous"); - rocsparse_handle handle = controls.getRocsparseHandle(); + rocsparse_handle rocsparseHandle = + KokkosKernels::Impl::RocsparseSingleton::singleton().rocsparseHandle; // resets handle stream to NULL when out of scope - KokkosSparse::Impl::TemporarySetRocsparseStream tsrs(handle, exec); + KokkosSparse::Impl::TemporarySetRocsparseStream tsrs(rocsparseHandle, exec); // set the mode rocsparse_operation trans; @@ -864,45 +879,94 @@ void spmv_block_impl_rocsparse( reinterpret_cast(&beta); rocsparse_value_type* y_ = reinterpret_cast(y.data()); - rocsparse_mat_descr descr; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&descr)); - rocsparse_mat_info info; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_info(&info)); + KokkosSparse::Impl::RocSparse_BSR_SpMV_Data* subhandle; + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for rocsparse BSR"); + } else { + subhandle = new KokkosSparse::Impl::RocSparse_BSR_SpMV_Data(exec); + handle->tpl = subhandle; + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_create_mat_descr(&subhandle->mat)); + // *_ex* functions deprecated in introduced in 6+ +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_create_mat_info(&subhandle->info)); + if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else 
if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else { + static_assert(KokkosKernels::Impl::always_false_v, + "unsupported value type for rocsparse_*bsrmv"); + } + // *_ex* functions introduced in 5.4.0 +#elif KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 + // No analysis step in the older versions +#else + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_create_mat_info(&subhandle->info)); + if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else if constexpr (std::is_same_v>) { + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex_analysis( + rocsparseHandle, dir, trans, mb, nb, nnzb, subhandle->mat, bsr_val, + bsr_row_ptr, bsr_col_ind, block_dim, subhandle->info)); + } else { + 
static_assert(KokkosKernels::Impl::always_false_v, + "unsupported value type for rocsparse_*bsrmv"); + } +#endif + handle->is_set_up = true; + } // *_ex* functions deprecated in introduced in 6+ #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_sbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_dbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + 
KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_cbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_zbsrmv(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); @@ -911,72 +975,59 @@ void spmv_block_impl_rocsparse( #elif KOKKOSSPARSE_IMPL_ROCM_VERSION < 50400 if constexpr (std::is_same_v) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, x_, beta_, y_)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, + bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); } else if constexpr (std::is_same_v) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, x_, beta_, y_)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, + bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, x_, beta_, y_)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, 
subhandle->mat, + bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, x_, beta_, y_)); + rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, subhandle->mat, + bsr_val, bsr_row_ptr, bsr_col_ind, block_dim, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } #else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_sbsrmv_ex( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_sbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_dbsrmv_ex( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_dbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex_analysis( - handle, dir, trans, mb, nb, nnzb, descr, 
bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_cbsrmv_ex( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_cbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else if constexpr (std::is_same_v>) { - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex_analysis( - handle, dir, trans, mb, nb, nnzb, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_zbsrmv_ex( - handle, dir, trans, mb, nb, nnzb, alpha_, descr, bsr_val, bsr_row_ptr, - bsr_col_ind, block_dim, info, x_, beta_, y_)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_bsrsv_clear(handle, info)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( + rocsparse_zbsrmv_ex(rocsparseHandle, dir, trans, mb, nb, nnzb, alpha_, + subhandle->mat, bsr_val, bsr_row_ptr, bsr_col_ind, + block_dim, subhandle->info, x_, beta_, y_)); } else { static_assert(KokkosKernels::Impl::always_false_v, "unsupported value type for rocsparse_*bsrmv"); } #endif - rocsparse_destroy_mat_descr(descr); - rocsparse_destroy_mat_info(info); - -} // spmv_block_impl_rocsparse +} // spmv_bsr_rocsparse #define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ COMPILE_LIBRARY) \ template <> \ struct SPMV_BSRMATRIX< \ Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ ::KokkosSparse::Experimental::BsrMatrix< \ SCALAR const, ORDINAL const, Kokkos::Device, \ Kokkos::MemoryTraits, OFFSET const>, \ @@ -988,20 +1039,22 @@ void spmv_block_impl_rocsparse( true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ - SCALAR const, ORDINAL 
const, device_type, memory_trait_type, \ - OFFSET const>; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = ::KokkosSparse::Experimental::BsrMatrix< \ + SCALAR const, ORDINAL const, device_type, memory_trait_type, \ + OFFSET const>; \ using XVector = Kokkos::View< \ SCALAR const*, LAYOUT, device_type, \ Kokkos::MemoryTraits>; \ using YVector = \ Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ \ using coefficient_type = typename YVector::non_const_value_type; \ \ - static void spmv_bsrmatrix(const Kokkos::HIP& exec, \ - const Controls& controls, const char mode[], \ + static void spmv_bsrmatrix(const Kokkos::HIP& exec, Handle* handle, \ + const char mode[], \ const coefficient_type& alpha, \ const AMatrix& A, const XVector& x, \ const coefficient_type& beta, \ @@ -1009,7 +1062,7 @@ void spmv_block_impl_rocsparse( std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_block_impl_rocsparse(exec, controls, mode, alpha, A, x, beta, y); \ + spmv_bsr_rocsparse(exec, handle, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -1044,7 +1097,6 @@ KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, rocsparse_int, #undef KOKKOSSPARSE_SPMV_ROCSPARSE } // namespace Impl -} // namespace Experimental } // namespace KokkosSparse #endif // defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp index 5e33df1fa3de..88fef4421a3a 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp @@ -21,7 +21,8 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template > struct 
spmv_mv_tpl_spec_avail { @@ -33,6 +34,8 @@ struct spmv_mv_tpl_spec_avail { template <> \ struct spmv_mv_tpl_spec_avail< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ @@ -48,7 +51,14 @@ struct spmv_mv_tpl_spec_avail { non-transpose that produces incorrect result. This is cusparse distributed with CUDA 10.1.243. The bug seems to be resolved by CUSPARSE 10301 (present by CUDA 10.2.89) */ -#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) + +/* cusparseSpMM also produces incorrect results for some inputs in CUDA 11.6.1. + * (CUSPARSE_VERSION 11702). + * ALG1 and ALG3 produce completely incorrect results for one set of inputs. + * ALG2 works for that case, but has low numerical accuracy in another case. + */ +#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) && \ + (CUSPARSE_VERSION != 11702) KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp index 30e0b6e2434d..47b7d47f8ea5 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -18,15 +18,18 @@ #define KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ #include - -#include "KokkosKernels_Controls.hpp" +#include "KokkosKernels_tpl_handles_decl.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE /* CUSPARSE_VERSION < 10301 either doesn't have cusparseSpMM or the non-tranpose version produces incorrect results. + + Version 11702 corresponds to CUDA 11.6.1, which also produces incorrect + results. 11701 (CUDA 11.6.0) is OK. 
*/ -#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) +#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) && \ + (CUSPARSE_VERSION != 11702) #include "cusparse.h" #include "KokkosSparse_Utils_cusparse.hpp" @@ -64,9 +67,14 @@ inline cudaDataType compute_type() { */ template = true> cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { - const int64_t rows = view.extent(0); - const int64_t cols = view.extent(1); - const int64_t ld = view.extent(0); + // If the view is LayoutRight, we still need to create descr as column-major + // but it should be an implicit transpose, meaning dimensions and strides are + // swapped + bool transpose = + std::is_same_v; + const size_t rows = transpose ? view.extent(1) : view.extent(0); + const size_t cols = transpose ? view.extent(0) : view.extent(1); + const size_t ld = transpose ? view.stride(0) : view.stride(1); // cusparseCreateCsr notes it is safe to const_cast this away for input // pointers to a descriptor as long as that descriptor is not an output @@ -84,15 +92,15 @@ cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { const cusparseOrder_t order = CUSPARSE_ORDER_COL; cusparseDnMatDescr_t descr; - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseCreateDnMat(&descr, rows, cols, ld, values, valueType, order)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnMat( + &descr, static_cast(rows), static_cast(cols), + static_cast(ld), values, valueType, order)); return descr; } -template -void spmv_mv_cusparse(const Kokkos::Cuda &exec, - const KokkosKernels::Experimental::Controls &controls, +template +void spmv_mv_cusparse(const Kokkos::Cuda &exec, Handle *handle, const char mode[], typename YVector::non_const_value_type const &alpha, const AMatrix &A, const XVector &x, @@ -110,9 +118,17 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, using y_value_type = typename YVector::non_const_value_type; /* initialize cusparse library */ - cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + 
cusparseHandle_t cusparseHandle = + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ - TemporarySetCusparseStream(cusparseHandle, exec); + TemporarySetCusparseStream tscs(cusparseHandle, exec); + + /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ + const cusparseIndexType_t myCusparseOffsetType = + cusparse_index_type_t_from(); + const cusparseIndexType_t myCusparseEntryType = + cusparse_index_type_t_from(); + const cudaDataType aCusparseType = cuda_data_type_from(); /* Set the operation mode */ cusparseOperation_t opA; @@ -127,21 +143,6 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, } } - /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ - const cusparseIndexType_t myCusparseOffsetType = - cusparse_index_type_t_from(); - const cusparseIndexType_t myCusparseEntryType = - cusparse_index_type_t_from(); - const cudaDataType aCusparseType = cuda_data_type_from(); - - /* create matrix */ - cusparseSpMatDescr_t A_cusparse; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( - &A_cusparse, A.numRows(), A.numCols(), A.nnz(), - (void *)A.graph.row_map.data(), (void *)A.graph.entries.data(), - (void *)A.values.data(), myCusparseOffsetType, myCusparseEntryType, - CUSPARSE_INDEX_BASE_ZERO, aCusparseType)); - /* create lhs and rhs NOTE: The descriptions always say vecX and vecY are column-major cusparse order. For CUSPARSE_VERSION 10301 this is the only supported ordering. if X @@ -152,16 +153,20 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, constexpr bool xIsLR = std::is_same::value; static_assert(xIsLL || xIsLR, "X multivector was not LL or LR (TPL error)"); + static_assert( + std::is_same_v, + "Y multivector was not LL (TPL error)"); cusparseDnMatDescr_t vecX = make_cusparse_dn_mat_descr_t(x); cusparseDnMatDescr_t vecY = make_cusparse_dn_mat_descr_t(y); cusparseOperation_t opB = xIsLL ? 
CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; -// CUSPARSE_MM_ALG_DEFAULT was deprecated as early as 11.1 (maybe earlier) -#if CUSPARSE_VERSION < 11010 - const cusparseSpMMAlg_t alg = CUSPARSE_MM_ALG_DEFAULT; +// CUSPARSE_MM_ALG_DEFAULT was deprecated in CUDA 11.0.1 / cuSPARSE 11.0.0 and +// removed in CUDA 12.0.0 / cuSPARSE 12.0.0 +#if CUSPARSE_VERSION < 11000 + cusparseSpMMAlg_t algo = CUSPARSE_MM_ALG_DEFAULT; #else - const cusparseSpMMAlg_t alg = CUSPARSE_SPMM_ALG_DEFAULT; + cusparseSpMMAlg_t algo = CUSPARSE_SPMM_ALG_DEFAULT; #endif // the precision of the SpMV @@ -180,21 +185,39 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, } } - size_t bufferSize = 0; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM_bufferSize( - cusparseHandle, opA, opB, &alpha, A_cusparse, vecX, &beta, vecY, - computeType, alg, &bufferSize)); + KokkosSparse::Impl::CuSparse10_SpMV_Data *subhandle; + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); + } else { + subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); + handle->tpl = subhandle; + /* create matrix */ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), + (void *)A.graph.row_map.data(), (void *)A.graph.entries.data(), + (void *)A.values.data(), myCusparseOffsetType, myCusparseEntryType, + CUSPARSE_INDEX_BASE_ZERO, aCusparseType)); + + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM_bufferSize( + cusparseHandle, opA, opB, &alpha, subhandle->mat, vecX, &beta, vecY, + computeType, algo, &subhandle->bufferSize)); + + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(&subhandle->buffer, subhandle->bufferSize)); + + handle->is_set_up = true; + } - void *dBuffer = nullptr; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM(cusparseHandle, opA, opB, &alpha, - A_cusparse, vecX, &beta, vecY, - computeType, alg, 
dBuffer)); + subhandle->mat, vecX, &beta, vecY, + computeType, algo, subhandle->buffer)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dBuffer)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecX)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecY)); - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(A_cusparse)); } #define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE, \ @@ -202,6 +225,8 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, template <> \ struct SPMV_MV< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix< \ SCALAR const, ORDINAL const, Kokkos::Device, \ Kokkos::MemoryTraits, OFFSET const>, \ @@ -213,6 +238,9 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, false, true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ using AMatrix = CrsMatrix; \ using XVector = Kokkos::View< \ @@ -223,15 +251,14 @@ void spmv_mv_cusparse(const Kokkos::Cuda &exec, \ using coefficient_type = typename YVector::non_const_value_type; \ \ - using Controls = KokkosKernels::Experimental::Controls; \ - static void spmv_mv(const Kokkos::Cuda &exec, const Controls &controls, \ + static void spmv_mv(const Kokkos::Cuda &exec, Handle *handle, \ const char mode[], const coefficient_type &alpha, \ const AMatrix &A, const XVector &x, \ const coefficient_type &beta, const YVector &y) { \ std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_mv_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ + spmv_mv_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 653ec9481160..854c2f2b263c 100644 --- 
a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -24,7 +24,8 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spmv_tpl_spec_avail { enum : bool { value = false }; }; @@ -40,6 +41,8 @@ struct spmv_tpl_spec_avail { template <> \ struct spmv_tpl_spec_avail< \ Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix< \ const SCALAR, const ORDINAL, Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET>, \ @@ -187,6 +190,9 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, template <> \ struct spmv_tpl_spec_avail< \ Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ KokkosSparse::CrsMatrix, \ Kokkos::MemoryTraits, \ @@ -217,22 +223,24 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(Kokkos::complex, #endif // KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ - template <> \ - struct spmv_tpl_spec_avail< \ - EXECSPACE, \ - KokkosSparse::CrsMatrix, \ - Kokkos::MemoryTraits, \ - const MKL_INT>, \ - Kokkos::View< \ - const SCALAR*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_tpl_spec_avail< \ + EXECSPACE, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + const MKL_INT>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; #ifdef KOKKOS_ENABLE_SERIAL @@ -251,45 +259,57 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #if 
defined(KOKKOS_ENABLE_SYCL) && \ !defined(KOKKOSKERNELS_ENABLE_TPL_MKL_SYCL_OVERRIDE) -#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ - template <> \ - struct spmv_tpl_spec_avail< \ - Kokkos::Experimental::SYCL, \ - KokkosSparse::CrsMatrix< \ - const SCALAR, const ORDINAL, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, const ORDINAL>, \ - Kokkos::View< \ - const SCALAR*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL(SCALAR, ORDINAL, MEMSPACE) \ + template <> \ + struct spmv_tpl_spec_avail< \ + Kokkos::Experimental::SYCL, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix< \ + const SCALAR, const ORDINAL, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const ORDINAL>, \ + Kokkos::View< \ + const SCALAR*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; +// intel-oneapi-mkl/2023.2.0: spmv with complex data types produce: +// oneapi::mkl::sparse::gemv: unimplemented functionality: currently does not +// support complex data types. +// TODO: Revisit with later versions and selectively enable this if it's +// working. 
+ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( float, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( double, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +/* KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( Kokkos::complex, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( Kokkos::complex, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +*/ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( float, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( double, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +/* KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( Kokkos::complex, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ONEMKL( Kokkos::complex, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace) +*/ #endif #endif // KOKKOSKERNELS_ENABLE_TPL_MKL diff --git a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index a4c50e437f85..926d201a52fd 100644 --- a/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/packages/kokkos-kernels/sparse/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -18,8 +18,7 @@ #define KOKKOSPARSE_SPMV_TPL_SPEC_DECL_HPP_ #include - -#include "KokkosKernels_Controls.hpp" +#include "KokkosKernels_tpl_handles_decl.hpp" // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -29,10 +28,8 @@ namespace KokkosSparse { namespace Impl { -template -void spmv_cusparse(const Kokkos::Cuda& exec, - const KokkosKernels::Experimental::Controls& controls, - const char mode[], +template +void spmv_cusparse(const Kokkos::Cuda& exec, Handle* handle, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -41,9 +38,10 @@ void spmv_cusparse(const Kokkos::Cuda& exec, 
using value_type = typename AMatrix::non_const_value_type; /* initialize cusparse library */ - cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + cusparseHandle_t cusparseHandle = + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; /* Set cuSPARSE to use the given stream until this function exits */ - TemporarySetCusparseStream(cusparseHandle, exec); + TemporarySetCusparseStream tscs(cusparseHandle, exec); /* Set the operation mode */ cusparseOperation_t myCusparseOperation; @@ -65,14 +63,11 @@ void spmv_cusparse(const Kokkos::Cuda& exec, !Kokkos::ArithTraits::isComplex) myCusparseOperation = CUSPARSE_OPERATION_TRANSPOSE; +// Hopefully this corresponds to CUDA reelase 10.1, which is the first to +// include the "generic" API #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) using entry_type = typename AMatrix::non_const_ordinal_type; - /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ - const cusparseIndexType_t myCusparseOffsetType = - cusparse_index_type_t_from(); - const cusparseIndexType_t myCusparseEntryType = - cusparse_index_type_t_from(); cudaDataType myCudaDataType; if (std::is_same::value) @@ -88,13 +83,11 @@ void spmv_cusparse(const Kokkos::Cuda& exec, "Scalar (data) type of CrsMatrix isn't supported by cuSPARSE, yet TPL " "layer says it is"); - /* create matrix */ - cusparseSpMatDescr_t A_cusparse; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( - &A_cusparse, A.numRows(), A.numCols(), A.nnz(), - (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(), - (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType, - CUSPARSE_INDEX_BASE_ZERO, myCudaDataType)); + /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ + const cusparseIndexType_t myCusparseOffsetType = + cusparse_index_type_t_from(); + const cusparseIndexType_t myCusparseEntryType = + cusparse_index_type_t_from(); /* create lhs and rhs */ cusparseDnVecDescr_t vecX, vecY; @@ -103,150 
+96,170 @@ void spmv_cusparse(const Kokkos::Cuda& exec, KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec( &vecY, y.extent_int(0), (void*)y.data(), myCudaDataType)); - size_t bufferSize = 0; - void* dBuffer = NULL; -#if CUSPARSE_VERSION >= 11301 - cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; -#else - cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; -#endif - if (controls.isParameter("algorithm")) { - const std::string algName = controls.getParameter("algorithm"); - if (algName == "default") -#if CUSPARSE_VERSION >= 11301 - alg = CUSPARSE_SPMV_ALG_DEFAULT; + // use default cusparse algo for best performance +#if CUSPARSE_VERSION >= 11400 + cusparseSpMVAlg_t algo = CUSPARSE_SPMV_ALG_DEFAULT; #else - alg = CUSPARSE_MV_ALG_DEFAULT; + cusparseSpMVAlg_t algo = CUSPARSE_MV_ALG_DEFAULT; #endif - else if (algName == "merge") -#if CUSPARSE_VERSION >= 11301 - alg = CUSPARSE_SPMV_CSR_ALG2; + + KokkosSparse::Impl::CuSparse10_SpMV_Data* subhandle; + + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); + } else { + subhandle = new KokkosSparse::Impl::CuSparse10_SpMV_Data(exec); + handle->tpl = subhandle; + + /* create matrix */ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), + (void*)A.graph.row_map.data(), (void*)A.graph.entries.data(), + (void*)A.values.data(), myCusparseOffsetType, myCusparseEntryType, + CUSPARSE_INDEX_BASE_ZERO, myCudaDataType)); + + /* size and allocate buffer */ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( + cusparseHandle, myCusparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, myCudaDataType, algo, &subhandle->bufferSize)); + // Async memory management introduced in CUDA 11.2 +#if (CUDA_VERSION >= 11020) + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMallocAsync( + &subhandle->buffer, subhandle->bufferSize, exec.cuda_stream())); #else - alg = CUSPARSE_CSRMV_ALG2; + 
KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(&subhandle->buffer, subhandle->bufferSize)); #endif + handle->is_set_up = true; } - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( - cusparseHandle, myCusparseOperation, &alpha, A_cusparse, vecX, &beta, - vecY, myCudaDataType, alg, &bufferSize)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize)); /* perform SpMV */ - KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV(cusparseHandle, myCusparseOperation, - &alpha, A_cusparse, vecX, &beta, vecY, - myCudaDataType, alg, dBuffer)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSpMV(cusparseHandle, myCusparseOperation, &alpha, subhandle->mat, + vecX, &beta, vecY, myCudaDataType, algo, subhandle->buffer)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dBuffer)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecX)); KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnVec(vecY)); - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(A_cusparse)); #elif (9000 <= CUDA_VERSION) - /* create and set the matrix descriptor */ - cusparseMatDescr_t descrA = 0; - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); - KOKKOS_CUSPARSE_SAFE_CALL( - cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + KokkosSparse::Impl::CuSparse9_SpMV_Data* subhandle; - /* perform the actual SpMV operation */ - if (std::is_same::value) { - if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrmv( - cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - - } else if (std::is_same::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrmv( - cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - 
A.graph.row_map.data(), A.graph.entries.data(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrmv( - cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else if (std::is_same>::value) { - KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrmv( - cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), - A.nnz(), reinterpret_cast(&alpha), descrA, - reinterpret_cast(A.values.data()), - A.graph.row_map.data(), A.graph.entries.data(), - reinterpret_cast(x.data()), - reinterpret_cast(&beta), - reinterpret_cast(y.data()))); - } else { - throw std::logic_error( - "Trying to call cusparse SpMV with a scalar type not float/double, " - "nor complex of either!"); - } + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for cusparse"); } else { - throw std::logic_error( - "With cuSPARSE pre-10.0, offset type must be int. Something wrong with " - "TPL avail logic."); + /* create and set the subhandle and matrix descriptor */ + subhandle = new KokkosSparse::Impl::CuSparse9_SpMV_Data(exec); + handle->tpl = subhandle; + cusparseMatDescr_t descrA = 0; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&subhandle->mat)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatType(subhandle->mat, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatIndexBase(subhandle->mat, CUSPARSE_INDEX_BASE_ZERO)); + handle->is_set_up = true; } - KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); + /* perform the actual SpMV operation */ + static_assert( + std::is_same_v, + "With cuSPARSE pre-10.0, offset type must be int. 
Something wrong with " + "TPL avail logic."); + if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrmv( + cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), A.nnz(), + reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), A.graph.row_map.data(), + A.graph.entries.data(), reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + + } else if constexpr (std::is_same_v) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrmv( + cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), A.nnz(), + reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrmv( + cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), A.nnz(), + reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if constexpr (std::is_same_v>) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrmv( + cusparseHandle, myCusparseOperation, A.numRows(), A.numCols(), A.nnz(), + reinterpret_cast(&alpha), subhandle->mat, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else { + static_assert( + static_assert(KokkosKernels::Impl::always_false_v, + "Trying to call cusparse SpMV with a scalar type not float/double, " + "nor complex of either!"); + } #endif // CUDA_VERSION } -#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - Kokkos::Cuda, \ - KokkosSparse::CrsMatrix< \ - SCALAR const, ORDINAL const, Kokkos::Device, \ - 
Kokkos::MemoryTraits, OFFSET const>, \ - Kokkos::View< \ - SCALAR const*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = CrsMatrix; \ - using XVector = Kokkos::View< \ - SCALAR const*, LAYOUT, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - using coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv(const Kokkos::Cuda& exec, const Controls& controls, \ - const char mode[], const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_cusparse(exec, controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::Cuda, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv(const Kokkos::Cuda& exec, Handle* handle, \ + 
const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_cusparse(exec, handle, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -// BMK: cuSPARSE that comes with CUDA 9 does not support tranpose or conjugate -// transpose modes. No version of cuSPARSE supports mode C (conjugate, non -// transpose). In those cases, fall back to KokkosKernels native spmv. - #if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, Kokkos::CudaSpace, @@ -362,10 +375,8 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, namespace KokkosSparse { namespace Impl { -template -void spmv_rocsparse(const Kokkos::HIP& exec, - const KokkosKernels::Experimental::Controls& controls, - const char mode[], +template +void spmv_rocsparse(const Kokkos::HIP& exec, Handle* handle, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, const XVector& x, typename YVector::non_const_value_type const& beta, @@ -375,9 +386,10 @@ void spmv_rocsparse(const Kokkos::HIP& exec, using value_type = typename AMatrix::non_const_value_type; /* initialize rocsparse library */ - rocsparse_handle handle = controls.getRocsparseHandle(); + rocsparse_handle rocsparseHandle = + KokkosKernels::Impl::RocsparseSingleton::singleton().rocsparseHandle; /* Set rocsparse to use the given stream until this function exits */ - TemporarySetRocsparseStream(handle, exec); + TemporarySetRocsparseStream tsrs(rocsparseHandle, exec); /* Set the operation mode */ rocsparse_operation myRocsparseOperation = mode_kk_to_rocsparse(mode); @@ -389,24 +401,6 @@ void spmv_rocsparse(const Kokkos::HIP& exec, /* Set the scalar type */ rocsparse_datatype compute_type = rocsparse_compute_type(); - /* Create the 
rocsparse mat and csr descr */ - rocsparse_mat_descr Amat; - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_mat_descr(&Amat)); - rocsparse_spmat_descr Aspmat; - // We need to do some casting to void* - // Note that row_map is always a const view so const_cast is necessary, - // however entries and values may not be const so we need to check first. - void* csr_row_ptr = - static_cast(const_cast(A.graph.row_map.data())); - void* csr_col_ind = - static_cast(const_cast(A.graph.entries.data())); - void* csr_val = static_cast(const_cast(A.values.data())); - - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_csr_descr( - &Aspmat, A.numRows(), A.numCols(), A.nnz(), csr_row_ptr, csr_col_ind, - csr_val, offset_index_type, entry_index_type, rocsparse_index_base_zero, - compute_type)); - /* Create rocsparse dense vectors for X and Y */ rocsparse_dnvec_descr vecX, vecY; void* x_data = static_cast( @@ -420,99 +414,134 @@ void spmv_rocsparse(const Kokkos::HIP& exec, &vecY, y.extent_int(0), y_data, rocsparse_compute_type())); - /* Actually perform the SpMV operation, first size buffer, then compute result - */ - size_t buffer_size = 0; - void* tmp_buffer = nullptr; rocsparse_spmv_alg alg = rocsparse_spmv_alg_default; - // Note, Dec 6th 2021 - lbv: - // rocSPARSE offers two diffrent algorithms for spmv - // 1. ocsparse_spmv_alg_csr_adaptive - // 2. rocsparse_spmv_alg_csr_stream - // it is unclear which one is the default algorithm - // or what both algorithms are intended for? 
- if (controls.isParameter("algorithm")) { - const std::string algName = controls.getParameter("algorithm"); - if (algName == "default") - alg = rocsparse_spmv_alg_default; - else if (algName == "merge") - alg = rocsparse_spmv_alg_csr_stream; + + KokkosSparse::Impl::RocSparse_CRS_SpMV_Data* subhandle; + if (handle->is_set_up) { + subhandle = + dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for rocsparse CRS"); + } else { + subhandle = new KokkosSparse::Impl::RocSparse_CRS_SpMV_Data(exec); + handle->tpl = subhandle; + /* Create the rocsparse csr descr */ + // We need to do some casting to void* + // Note that row_map is always a const view so const_cast is necessary, + // however entries and values may not be const so we need to check first. + void* csr_row_ptr = + static_cast(const_cast(A.graph.row_map.data())); + void* csr_col_ind = + static_cast(const_cast(A.graph.entries.data())); + void* csr_val = + static_cast(const_cast(A.values.data())); + + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_create_csr_descr( + &subhandle->mat, A.numRows(), A.numCols(), A.nnz(), csr_row_ptr, + csr_col_ind, csr_val, offset_index_type, entry_index_type, + rocsparse_index_base_zero, compute_type)); + + /* Size and allocate buffer, and analyze the matrix */ + +#if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_buffer_size, + &subhandle->bufferSize, nullptr)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMalloc(&subhandle->buffer, subhandle->bufferSize)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_preprocess, + &subhandle->bufferSize, subhandle->buffer)); +#elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 + 
KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_auto, + &subhandle->bufferSize, nullptr)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMalloc(&subhandle->buffer, subhandle->bufferSize)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_preprocess, + &subhandle->bufferSize, subhandle->buffer)); +#else + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, &subhandle->bufferSize, nullptr)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMalloc(&subhandle->buffer, subhandle->bufferSize)); +#endif + handle->is_set_up = true; } + /* Perform the actual computation */ + #if KOKKOSSPARSE_IMPL_ROCM_VERSION >= 60000 - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, - vecY, compute_type, alg, rocsparse_spmv_stage_buffer_size, - &buffer_size, tmp_buffer)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, - vecY, compute_type, alg, rocsparse_spmv_stage_compute, - &buffer_size, tmp_buffer)); + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv( + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_compute, + &subhandle->bufferSize, subhandle->buffer)); #elif KOKKOSSPARSE_IMPL_ROCM_VERSION >= 50400 KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( - handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, - compute_type, alg, rocsparse_spmv_stage_auto, &buffer_size, tmp_buffer)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); - 
KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_spmv_ex( - handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, vecY, - compute_type, alg, rocsparse_spmv_stage_auto, &buffer_size, tmp_buffer)); + rocsparseHandle, myRocsparseOperation, &alpha, subhandle->mat, vecX, + &beta, vecY, compute_type, alg, rocsparse_spmv_stage_compute, + &subhandle->bufferSize, subhandle->buffer)); #else KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, - vecY, compute_type, alg, &buffer_size, tmp_buffer)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&tmp_buffer, buffer_size)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( - rocsparse_spmv(handle, myRocsparseOperation, &alpha, Aspmat, vecX, &beta, - vecY, compute_type, alg, &buffer_size, tmp_buffer)); + rocsparse_spmv(rocsparseHandle, myRocsparseOperation, &alpha, + subhandle->mat, vecX, &beta, vecY, compute_type, alg, + &subhandle->bufferSize, subhandle->buffer)); #endif - KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(tmp_buffer)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecY)); KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_dnvec_descr(vecX)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_spmat_descr(Aspmat)); - KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_destroy_mat_descr(Amat)); } -#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT, COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - Kokkos::HIP, \ - KokkosSparse::CrsMatrix, \ - Kokkos::MemoryTraits, \ - rocsparse_int const>, \ - Kokkos::View< \ - SCALAR const*, LAYOUT, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using device_type = Kokkos::Device; \ - using memory_trait_type = Kokkos::MemoryTraits; \ - using AMatrix = CrsMatrix; \ - using XVector = Kokkos::View< \ - SCALAR const*, LAYOUT, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - using 
coefficient_type = typename YVector::non_const_value_type; \ - \ - static void spmv(const Kokkos::HIP& exec, const Controls& controls, \ - const char mode[], const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_rocsparse(exec, controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSSPARSE_SPMV_ROCSPARSE(SCALAR, LAYOUT, COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::HIP, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix, \ + Kokkos::MemoryTraits, \ + rocsparse_int const>, \ + Kokkos::View< \ + SCALAR const*, LAYOUT, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using Handle = KokkosSparse::Impl::SPMVHandleImpl< \ + Kokkos::HIP, Kokkos::HIPSpace, SCALAR, rocsparse_int, rocsparse_int>; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv(const Kokkos::HIP& exec, Handle* handle, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_ROCSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_rocsparse(exec, handle, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSSPARSE_SPMV_ROCSPARSE(double, Kokkos::LayoutLeft, @@ -548,82 +577,77 @@ namespace Impl { #if (__INTEL_MKL__ > 2017) // MKL 
2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, MKL_INT m, - MKL_INT n, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const float* Avalues, - const float* x, float* y) { - sparse_matrix_t A_mkl; - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), - const_cast(Avalues))); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); -} - -inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, - MKL_INT m, MKL_INT n, const MKL_INT* Arowptrs, - const MKL_INT* Aentries, const double* Avalues, - const double* x, double* y) { - sparse_matrix_t A_mkl; - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), - const_cast(Avalues))); - KOKKOSKERNELS_MKL_SAFE_CALL( - mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); -} - -inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - sparse_matrix_t A_mkl; - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), - 
(MKL_Complex8*)Avalues)); - MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( - op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); -} - -inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, - Kokkos::complex beta, MKL_INT m, MKL_INT n, - const MKL_INT* Arowptrs, const MKL_INT* Aentries, - const Kokkos::complex* Avalues, - const Kokkos::complex* x, - Kokkos::complex* y) { - sparse_matrix_t A_mkl; - matrix_descr A_descr; - A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; - A_descr.mode = SPARSE_FILL_MODE_FULL; - A_descr.diag = SPARSE_DIAG_NON_UNIT; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( - &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), - const_cast(Arowptrs + 1), const_cast(Aentries), - (MKL_Complex16*)Avalues)); - MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; - MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; - KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( - op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); +// Note: Scalar here is the Kokkos type, not the MKL type +template +inline void spmv_mkl(Handle* handle, sparse_operation_t op, Scalar alpha, + Scalar beta, MKL_INT m, MKL_INT n, const MKL_INT* Arowptrs, + const MKL_INT* Aentries, const Scalar* Avalues, + const Scalar* x, Scalar* y) { + using MKLScalar = typename KokkosToMKLScalar::type; + using ExecSpace = typename Handle::ExecutionSpaceType; + using Subhandle = MKL_SpMV_Data; + Subhandle* subhandle; + const MKLScalar* x_mkl = reinterpret_cast(x); + MKLScalar* y_mkl = reinterpret_cast(y); + if (handle->is_set_up) { + subhandle = dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for MKL CRS"); + } else { + // Use the default execution space instance, as classic MKL does not use + // a specific instance. 
+ subhandle = new Subhandle(ExecSpace()); + handle->tpl = subhandle; + subhandle->descr.type = SPARSE_MATRIX_TYPE_GENERAL; + subhandle->descr.mode = SPARSE_FILL_MODE_FULL; + subhandle->descr.diag = SPARSE_DIAG_NON_UNIT; + // Note: the create_csr routine requires non-const values even though + // they're not actually modified + MKLScalar* Avalues_mkl = + reinterpret_cast(const_cast(Avalues)); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), Avalues_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), Avalues_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( + &subhandle->mat, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), Avalues_mkl)); + } + handle->is_set_up = true; + } + MKLScalar alpha_mkl = KokkosToMKLScalar(alpha); + MKLScalar beta_mkl = KokkosToMKLScalar(beta); + if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v>) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } else if constexpr (std::is_same_v>) { + 
KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv(op, alpha_mkl, subhandle->mat, + subhandle->descr, x_mkl, + beta_mkl, y_mkl)); + } } // Note: classic MKL runs on Serial/OpenMP but can't use our execution space @@ -631,6 +655,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ template <> \ struct SPMV, \ KokkosSparse::CrsMatrix< \ SCALAR const, MKL_INT const, \ Kokkos::Device, \ @@ -644,6 +670,9 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, Kokkos::MemoryTraits>, \ true, COMPILE_LIBRARY> { \ using device_type = Kokkos::Device; \ + using Handle = \ + KokkosSparse::Impl::SPMVHandleImpl; \ using AMatrix = \ CrsMatrix, MKL_INT const>; \ @@ -653,17 +682,16 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, using YVector = Kokkos::View>; \ using coefficient_type = typename YVector::non_const_value_type; \ - using Controls = KokkosKernels::Experimental::Controls; \ \ - static void spmv(const EXECSPACE&, const Controls&, const char mode[], \ + static void spmv(const EXECSPACE&, Handle* handle, const char mode[], \ const coefficient_type& alpha, const AMatrix& A, \ const XVector& x, const coefficient_type& beta, \ const YVector& y) { \ std::string label = "KokkosSparse::spmv[TPL_MKL," + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ - A.graph.row_map.data(), A.graph.entries.data(), \ + spmv_mkl(handle, mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ + A.numCols(), A.graph.row_map.data(), A.graph.entries.data(), \ A.values.data(), x.data(), y.data()); \ Kokkos::Profiling::popRegion(); \ } \ @@ -705,122 +733,103 @@ inline oneapi::mkl::transpose mode_kk_to_onemkl(char mode_kk) { "Invalid mode for oneMKL (should be one of N, T, H)"); } -template -struct spmv_onemkl_wrapper {}; - -template <> -struct spmv_onemkl_wrapper { - template - static 
void spmv(const execution_space& exec, oneapi::mkl::transpose mkl_mode, - typename matrix_type::non_const_value_type const alpha, - const matrix_type& A, const xview_type& x, - typename matrix_type::non_const_value_type const beta, - const yview_type& y) { - using scalar_type = typename matrix_type::non_const_value_type; - using ordinal_type = typename matrix_type::non_const_ordinal_type; - - // oneAPI doesn't directly support mode H with real values, but this is - // equivalent to mode T - if (mkl_mode == oneapi::mkl::transpose::conjtrans && - !Kokkos::ArithTraits::isComplex) - mkl_mode = oneapi::mkl::transpose::trans; - - oneapi::mkl::sparse::matrix_handle_t handle = nullptr; - oneapi::mkl::sparse::init_matrix_handle(&handle); - auto ev_set = oneapi::mkl::sparse::set_csr_data( - exec.sycl_queue(), handle, A.numRows(), A.numCols(), +template +inline void spmv_onemkl(const execution_space& exec, Handle* handle, + oneapi::mkl::transpose mkl_mode, + typename matrix_type::non_const_value_type const alpha, + const matrix_type& A, const xview_type& x, + typename matrix_type::non_const_value_type const beta, + const yview_type& y) { + using scalar_type = typename matrix_type::non_const_value_type; + using onemkl_scalar_type = typename KokkosToOneMKLScalar::type; + using ordinal_type = typename matrix_type::non_const_ordinal_type; + + // oneAPI doesn't directly support mode H with real values, but this is + // equivalent to mode T + if (mkl_mode == oneapi::mkl::transpose::conjtrans && + !Kokkos::ArithTraits::isComplex) + mkl_mode = oneapi::mkl::transpose::trans; + + OneMKL_SpMV_Data* subhandle; + if (handle->is_set_up) { + subhandle = dynamic_cast(handle->tpl); + if (!subhandle) + throw std::runtime_error( + "KokkosSparse::spmv: subhandle is not set up for OneMKL CRS"); + } else { + subhandle = new OneMKL_SpMV_Data(exec); + handle->tpl = subhandle; + oneapi::mkl::sparse::init_matrix_handle(&subhandle->mat); + // Even for out-of-order SYCL queue, the inputs here do not 
depend on + // kernels being sequenced + auto ev = oneapi::mkl::sparse::set_csr_data( + exec.sycl_queue(), subhandle->mat, A.numRows(), A.numCols(), oneapi::mkl::index_base::zero, const_cast(A.graph.row_map.data()), const_cast(A.graph.entries.data()), - const_cast(A.values.data())); - auto ev_opt = oneapi::mkl::sparse::optimize_gemv( - exec.sycl_queue(), mkl_mode, handle, {ev_set}); - auto ev_gemv = - oneapi::mkl::sparse::gemv(exec.sycl_queue(), mkl_mode, alpha, handle, - x.data(), beta, y.data(), {ev_opt}); - auto ev_release = oneapi::mkl::sparse::release_matrix_handle( - exec.sycl_queue(), &handle, {ev_gemv}); - ev_release.wait(); - } -}; - -template <> -struct spmv_onemkl_wrapper { - template - static void spmv(const execution_space& exec, oneapi::mkl::transpose mkl_mode, - typename matrix_type::non_const_value_type const alpha, - const matrix_type& A, const xview_type& x, - typename matrix_type::non_const_value_type const beta, - const yview_type& y) { - using scalar_type = typename matrix_type::non_const_value_type; - using ordinal_type = typename matrix_type::non_const_ordinal_type; - using mag_type = typename Kokkos::ArithTraits::mag_type; - - oneapi::mkl::sparse::matrix_handle_t handle = nullptr; - oneapi::mkl::sparse::init_matrix_handle(&handle); - auto ev_set = oneapi::mkl::sparse::set_csr_data( - exec.sycl_queue(), handle, static_cast(A.numRows()), - static_cast(A.numCols()), oneapi::mkl::index_base::zero, - const_cast(A.graph.row_map.data()), - const_cast(A.graph.entries.data()), - reinterpret_cast*>( + reinterpret_cast( const_cast(A.values.data()))); - auto ev_opt = oneapi::mkl::sparse::optimize_gemv( - exec.sycl_queue(), mkl_mode, handle, {ev_set}); - auto ev_gemv = oneapi::mkl::sparse::gemv( - exec.sycl_queue(), mkl_mode, alpha, handle, - reinterpret_cast*>( - const_cast(x.data())), - beta, reinterpret_cast*>(y.data()), {ev_opt}); - auto ev_release = oneapi::mkl::sparse::release_matrix_handle( - exec.sycl_queue(), &handle, {ev_gemv}); - 
ev_release.wait(); + // for out-of-order queue: the fence before gemv below will make sure + // optimize_gemv has finished + oneapi::mkl::sparse::optimize_gemv(exec.sycl_queue(), mkl_mode, + subhandle->mat, {ev}); + handle->is_set_up = true; } -}; - -#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE, COMPILE_LIBRARY) \ - template <> \ - struct SPMV< \ - Kokkos::Experimental::SYCL, \ - KokkosSparse::CrsMatrix< \ - SCALAR const, ORDINAL const, \ - Kokkos::Device, \ - Kokkos::MemoryTraits, ORDINAL const>, \ - Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, \ - Kokkos::Device, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - true, COMPILE_LIBRARY> { \ - using execution_space = Kokkos::Experimental::SYCL; \ - using device_type = Kokkos::Device; \ - using AMatrix = \ - CrsMatrix, ORDINAL const>; \ - using XVector = Kokkos::View< \ - SCALAR const*, Kokkos::LayoutLeft, device_type, \ - Kokkos::MemoryTraits>; \ - using YVector = Kokkos::View>; \ - using coefficient_type = typename YVector::non_const_value_type; \ - using Controls = KokkosKernels::Experimental::Controls; \ - \ - static void spmv(const execution_space& exec, const Controls&, \ - const char mode[], const coefficient_type& alpha, \ - const AMatrix& A, const XVector& x, \ - const coefficient_type& beta, const YVector& y) { \ - std::string label = "KokkosSparse::spmv[TPL_ONEMKL," + \ - Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - oneapi::mkl::transpose mkl_mode = mode_kk_to_onemkl(mode[0]); \ - spmv_onemkl_wrapper::is_complex>::spmv( \ - exec, mkl_mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ + + // Uncommon case: an out-of-order SYCL queue does not promise that previously + // enqueued kernels finish before starting this one. So fence exec to get the + // expected semantics. 
+ if (!exec.sycl_queue().is_in_order()) exec.fence(); + oneapi::mkl::sparse::gemv( + exec.sycl_queue(), mkl_mode, alpha, subhandle->mat, + reinterpret_cast(x.data()), beta, + reinterpret_cast(y.data())); +} + +#define KOKKOSSPARSE_SPMV_ONEMKL(SCALAR, ORDINAL, MEMSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV< \ + Kokkos::Experimental::SYCL, \ + KokkosSparse::Impl::SPMVHandleImpl, \ + KokkosSparse::CrsMatrix< \ + SCALAR const, ORDINAL const, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, ORDINAL const>, \ + Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, COMPILE_LIBRARY> { \ + using execution_space = Kokkos::Experimental::SYCL; \ + using device_type = Kokkos::Device; \ + using Handle = KokkosSparse::Impl::SPMVHandleImpl< \ + Kokkos::Experimental::SYCL, MEMSPACE, SCALAR, ORDINAL, ORDINAL>; \ + using AMatrix = \ + CrsMatrix, ORDINAL const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv(const execution_space& exec, Handle* handle, \ + const char mode[], const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + oneapi::mkl::transpose mkl_mode = mode_kk_to_onemkl(mode[0]); \ + spmv_onemkl(exec, handle, mkl_mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSSPARSE_SPMV_ONEMKL(float, std::int32_t, @@ -829,12 +838,14 @@ KOKKOSSPARSE_SPMV_ONEMKL(float, std::int32_t, KOKKOSSPARSE_SPMV_ONEMKL(double, std::int32_t, Kokkos::Experimental::SYCLDeviceUSMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +/* 
KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int32_t, - Kokkos::Experimental::SYCLDeviceUSMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::Experimental::SYCLDeviceUSMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +*/ KOKKOSSPARSE_SPMV_ONEMKL(float, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace, @@ -842,12 +853,14 @@ KOKKOSSPARSE_SPMV_ONEMKL(float, std::int64_t, KOKKOSSPARSE_SPMV_ONEMKL(double, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +/* KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) KOKKOSSPARSE_SPMV_ONEMKL(Kokkos::complex, std::int64_t, Kokkos::Experimental::SYCLDeviceUSMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +*/ #endif } // namespace Impl } // namespace KokkosSparse diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse.hpp index 8ae06b598a9a..624cd86ff5fe 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse.hpp @@ -16,9 +16,7 @@ #ifndef TEST_SPARSE_HPP #define TEST_SPARSE_HPP -#if KOKKOS_VERSION >= 40099 #include "Test_Sparse_coo2crs.hpp" -#endif // KOKKOS_VERSION >= 40099 #include "Test_Sparse_crs2coo.hpp" #include "Test_Sparse_Controls.hpp" #include "Test_Sparse_CrsMatrix.hpp" diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_bspgemm.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_bspgemm.hpp index d3c3a6134fca..58a2a18b8a30 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_bspgemm.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_bspgemm.hpp @@ -159,15 +159,6 @@ void test_bspgemm(lno_t 
blkDim, lno_t m, lno_t k, lno_t n, size_type nnz, return; } #endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL -#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUSPARSE_VERSION < 11600) - { - std::cerr - << "TEST SKIPPED: See " - "https://github.com/kokkos/kokkos-kernels/issues/1965 for details." - << std::endl; - return; - } -#endif using namespace Test; // device::execution_space::initialize(); // device::execution_space::print_configuration(std::cout); diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 35fbcb44a48f..48c7d41a9197 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -356,7 +356,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, // Zero out X before solving Kokkos::deep_copy(x_vector, zero); run_gauss_seidel(input_mat, GS_CLUSTER, x_vector, y_vector, symmetric, - apply_type, clusterSizes[csize], + apply_type, clusterSizes[csize], false, (ClusteringAlgorithm)algo); Kokkos::deep_copy(x_host, x_vector); for (lno_t i = 0; i < numVecs; i++) { @@ -752,17 +752,8 @@ void test_gauss_seidel_streams_rank1( } #endif // KOKKOS_ENABLE_OPENMP - std::vector instances; - if (nstreams == 1) - instances = Kokkos::Experimental::partition_space(execution_space(), 1); - else if (nstreams == 2) - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); - else if (nstreams == 3) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + auto instances = Kokkos::Experimental::partition_space( + execution_space(), std::vector(nstreams, 1)); std::vector kh_v(nstreams); std::vector input_mat_v(nstreams); diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gmres.hpp 
b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gmres.hpp index 199008752614..ee78d277297b 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gmres.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_gmres.hpp @@ -48,120 +48,163 @@ struct TolMeta { static constexpr float value = 1e-5; // Lower tolerance for floats }; +template ::value>::type* = nullptr> +AType get_A(int n, int diagDominance, int) { + using lno_t = typename Crs::ordinal_type; + typename Crs::non_const_size_type nnz = 10 * n; + auto A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + n, n, nnz, 0, lno_t(0.01 * n), diagDominance); + KokkosSparse::sort_crs_matrix(A); + + return A; +} + +template ::value>::type* = nullptr> +AType get_A(int n, int diagDominance, int block_size) { + using lno_t = typename Crs::ordinal_type; + typename Crs::non_const_size_type nnz = 10 * n; + auto A_unblocked = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + n, n, nnz, 0, lno_t(0.01 * n), diagDominance); + KokkosSparse::sort_crs_matrix(A_unblocked); + + // Convert to BSR + AType A(A_unblocked, block_size); + + return A; +} + template -void run_test_gmres() { - using exe_space = typename device::execution_space; - using mem_space = typename device::memory_space; - using sp_matrix_type = - KokkosSparse::CrsMatrix; +struct GmresTest { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using AT = Kokkos::ArithTraits; + using exe_space = typename device::execution_space; + using mem_space = typename device::memory_space; + + using Crs = CrsMatrix; + using Bsr = BsrMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>; using float_t = typename Kokkos::ArithTraits::mag_type; - // Create a diagonally dominant sparse matrix to test: - constexpr auto n = 5000; - constexpr auto m = 15; - constexpr auto tol = 
TolMeta::value; - constexpr auto numRows = n; - constexpr auto numCols = n; - constexpr auto diagDominance = 1; - constexpr bool verbose = false; - - typename sp_matrix_type::non_const_size_type nnz = 10 * numRows; - auto A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< - sp_matrix_type>(numRows, numCols, nnz, 0, lno_t(0.01 * numRows), - diagDominance); - - // Make kernel handles - KernelHandle kh; - kh.create_gmres_handle(m, tol); - auto gmres_handle = kh.get_gmres_handle(); - using GMRESHandle = - typename std::remove_reference::type; - using ViewVectorType = typename GMRESHandle::nnz_value_view_t; - - // Set initial vectors: - ViewVectorType X("X", n); // Solution and initial guess - ViewVectorType Wj("Wj", n); // For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), - n); // right-hand side vec - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B, 1.0); - - gmres_handle->set_verbose(verbose); - - // Test CGS2 - { - gmres(&kh, A, B, X); - - // Double check residuals at end of solve: - float_t nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - float_t endRes = KokkosBlas::nrm2(B) / nrmB; - - const auto conv_flag = gmres_handle->get_conv_flag_val(); - - EXPECT_LT(endRes, gmres_handle->get_tol()); - EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); - } + template + static void run_test_gmres() { + using sp_matrix_type = std::conditional_t; + + // Create a diagonally dominant sparse matrix to test: + constexpr auto n = 5000; + constexpr auto m = 15; + constexpr auto tol = TolMeta::value; + constexpr auto diagDominance = 1; + constexpr bool verbose = false; + constexpr auto block_size = UseBlocks ? 
10 : 1; + + auto A = get_A(n, diagDominance, block_size); + + if (verbose) { + std::cout << "Running GMRES test with block_size=" << block_size + << std::endl; + } + + // Make kernel handles + KernelHandle kh; + kh.create_gmres_handle(m, tol); + auto gmres_handle = kh.get_gmres_handle(); + using GMRESHandle = + typename std::remove_reference::type; + using ViewVectorType = typename GMRESHandle::nnz_value_view_t; + + // Set initial vectors: + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); - // Test MGS - { - gmres_handle->reset_handle(m, tol); - gmres_handle->set_ortho(GMRESHandle::Ortho::MGS); gmres_handle->set_verbose(verbose); - // reset X for next gmres call - Kokkos::deep_copy(X, 0.0); + // Test CGS2 + { + gmres(&kh, A, B, X); - gmres(&kh, A, B, X); + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; - // Double check residuals at end of solve: - float_t nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. 
- float_t endRes = KokkosBlas::nrm2(B) / nrmB; + const auto conv_flag = gmres_handle->get_conv_flag_val(); - const auto conv_flag = gmres_handle->get_conv_flag_val(); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } - EXPECT_LT(endRes, gmres_handle->get_tol()); - EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); - } + // Test MGS + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_ortho(GMRESHandle::Ortho::MGS); + gmres_handle->set_verbose(verbose); - // Test GSS2 with simple preconditioner - { - gmres_handle->reset_handle(m, tol); - gmres_handle->set_verbose(verbose); + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); + + gmres(&kh, A, B, X); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); - // Make precond - KokkosSparse::Experimental::MatrixPrec myPrec(A); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } - // reset X for next gmres call - Kokkos::deep_copy(X, 0.0); + // Test GSS2 with simple preconditioner + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_verbose(verbose); - gmres(&kh, A, B, X, &myPrec); + // Make precond + KokkosSparse::Experimental::MatrixPrec myPrec(A); - // Double check residuals at end of solve: - float_t nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. 
- float_t endRes = KokkosBlas::nrm2(B) / nrmB; + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); - const auto conv_flag = gmres_handle->get_conv_flag_val(); + gmres(&kh, A, B, X, &myPrec); - EXPECT_LT(endRes, gmres_handle->get_tol()); - EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + } } -} +}; } // namespace Test template void test_gmres() { - Test::run_test_gmres(); + using TestStruct = Test::GmresTest; + TestStruct::template run_test_gmres(); + TestStruct::template run_test_gmres(); } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_par_ilut.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_par_ilut.hpp index 4370ebe37e0a..cda09d0639e3 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_par_ilut.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_par_ilut.hpp @@ -29,6 +29,8 @@ #include "KokkosSparse_LUPrec.hpp" #include "KokkosSparse_SortCrs.hpp" +#include "Test_vector_fixtures.hpp" + #include using namespace KokkosSparse; @@ -52,69 +54,6 @@ struct TolMeta { } // namespace ParIlut -template -std::vector> decompress_matrix( - Kokkos::View& row_map, - Kokkos::View& entries, - Kokkos::View& values) { - const size_type nrows = row_map.size() - 1; - std::vector> result; - result.resize(nrows); - for (auto& row : result) { - row.resize(nrows, 0.0); - } - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - Kokkos::deep_copy(hrow_map, 
row_map); - Kokkos::deep_copy(hentries, entries); - Kokkos::deep_copy(hvalues, values); - - for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { - const size_type row_nnz_begin = hrow_map(row_idx); - const size_type row_nnz_end = hrow_map(row_idx + 1); - for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { - const lno_t col_idx = hentries(row_nnz); - const scalar_t value = hvalues(row_nnz); - result[row_idx][col_idx] = value; - } - } - - return result; -} - -template -void check_matrix(const std::string& name, - Kokkos::View& row_map, - Kokkos::View& entries, - Kokkos::View& values, - const std::vector>& expected) { - const auto decompressed_mtx = decompress_matrix(row_map, entries, values); - - const size_type nrows = row_map.size() - 1; - for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { - for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { - EXPECT_NEAR(expected[row_idx][col_idx], - decompressed_mtx[row_idx][col_idx], 0.01) - << "Failed check is: " << name << "[" << row_idx << "][" << col_idx - << "]"; - } - } -} - -template -void print_matrix(const std::vector>& matrix) { - for (const auto& row : matrix) { - for (const auto& item : row) { - std::printf("%.2f ", item); - } - std::cout << std::endl; - } -} - template void run_test_par_ilut() { @@ -131,47 +70,14 @@ void run_test_par_ilut() { {0.5, -3., 6., 0.}, {0.2, -0.5, -9., 0.}}; - const scalar_t ZERO = scalar_t(0); - - const size_type nrows = A.size(); - - // Count A nnz's - size_type nnz = 0; - for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { - for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { - if (A[row_idx][col_idx] != ZERO) { - ++nnz; - } - } - } - // Allocate device CRS views for A - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - // Create host mirror views for CRS A - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = 
Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); + RowMapType row_map("row_map", 0); + EntriesType entries("entries", 0); + ValuesType values("values", 0); - // Compress A into CRS (host views) - size_type curr_nnz = 0; - for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { - for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { - if (A[row_idx][col_idx] != ZERO) { - hentries(curr_nnz) = col_idx; - hvalues(curr_nnz) = A[row_idx][col_idx]; - ++curr_nnz; - } - hrow_map(row_idx + 1) = curr_nnz; - } - } + compress_matrix(row_map, entries, values, A); - // Copy host A CRS views to device A CRS views - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); + const size_type nrows = A.size(); // Make kernel handle KernelHandle kh; diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spadd.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spadd.hpp index 05ff97bb3a91..3156801dbd20 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spadd.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spadd.hpp @@ -32,7 +32,11 @@ typedef Kokkos::complex kokkos_complex_double; typedef Kokkos::complex kokkos_complex_float; -// Create a random square matrix for testing mat-mat addition kernels +// Create a random nrows by ncols matrix for testing mat-mat addition kernels. +// minNNZ, maxNNZ: min and max number of nonzeros in any row. +// maxNNZ > ncols will result in duplicated entries in a row, otherwise entries +// in a row are unique. 
+// sortRows: whether to sort columns in a row template crsMat_t randomMatrix(ordinal_type nrows, ordinal_type ncols, ordinal_type minNNZ, ordinal_type maxNNZ, bool sortRows) { @@ -117,7 +121,9 @@ void test_spadd(lno_t numRows, lno_t numCols, size_type minNNZ, srand((numRows << 1) ^ numCols); KernelHandle handle; - handle.create_spadd_handle(sortRows); + // If maxNNZ <= numCols, the generated A, B have unique column indices in each + // row + handle.create_spadd_handle(sortRows, static_cast(maxNNZ) <= numCols); crsMat_t A = randomMatrix(numRows, numCols, minNNZ, maxNNZ, sortRows); crsMat_t B = @@ -129,9 +135,10 @@ void test_spadd(lno_t numRows, lno_t numCols, size_type minNNZ, // initialized Kokkos::deep_copy(c_row_map, (size_type)5); auto addHandle = handle.get_spadd_handle(); - KokkosSparse::Experimental::spadd_symbolic(&handle, A.graph.row_map, - A.graph.entries, B.graph.row_map, - B.graph.entries, c_row_map); + typename Device::execution_space exec{}; + KokkosSparse::Experimental::spadd_symbolic( + exec, &handle, numRows, numCols, A.graph.row_map, A.graph.entries, + B.graph.row_map, B.graph.entries, c_row_map); size_type c_nnz = addHandle->get_c_nnz(); // Fill values, entries with incorrect incorret values_type c_values( @@ -140,9 +147,9 @@ void test_spadd(lno_t numRows, lno_t numCols, size_type minNNZ, entries_type c_entries("C entries", c_nnz); Kokkos::deep_copy(c_entries, (lno_t)5); KokkosSparse::Experimental::spadd_numeric( - &handle, A.graph.row_map, A.graph.entries, A.values, KAT::one(), - B.graph.row_map, B.graph.entries, B.values, KAT::one(), c_row_map, - c_entries, c_values); + exec, &handle, numRows, numCols, A.graph.row_map, A.graph.entries, + A.values, KAT::one(), B.graph.row_map, B.graph.entries, B.values, + KAT::one(), c_row_map, c_entries, c_values); // done with handle // create C using CRS arrays crsMat_t C("C", numRows, numCols, c_nnz, c_values, c_row_map, c_entries); diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spgemm.hpp 
b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spgemm.hpp index 7e655d4c0cee..bd1e68c37001 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spgemm.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spgemm.hpp @@ -486,16 +486,6 @@ void test_issue402() { template void test_issue1738() { -#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && (CUDA_VERSION >= 11000) && \ - (CUDA_VERSION < 11040) - { - std::cerr - << "TEST SKIPPED: See " - "https://github.com/kokkos/kokkos-kernels/issues/1777 for details." - << std::endl; - return; - } -#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL // Make sure that std::invalid_argument is thrown if you: // - call numeric where an input matrix's entries have changed. // - try to reuse an spgemm handle by calling symbolic with new input diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spiluk.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spiluk.hpp index 77cdb1ede129..2a8398ed46ba 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spiluk.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spiluk.hpp @@ -26,161 +26,139 @@ #include "KokkosBlas1_nrm2.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_spiluk.hpp" +#include "KokkosSparse_crs_to_bsr_impl.hpp" +#include "KokkosSparse_bsr_to_crs_impl.hpp" +#include "KokkosSparse_LUPrec.hpp" +#include "KokkosSparse_gmres.hpp" -#include +#include "Test_vector_fixtures.hpp" + +#include +#include using namespace KokkosSparse; using namespace KokkosSparse::Experimental; using namespace KokkosKernels; using namespace KokkosKernels::Experimental; -// #ifndef kokkos_complex_double -// #define kokkos_complex_double Kokkos::complex -// #define kokkos_complex_float Kokkos::complex -// #endif +using kokkos_complex_double = Kokkos::complex; +using kokkos_complex_float = Kokkos::complex; + +// Comment this out to do focussed debugging +#define TEST_SPILUK_FULL_CHECKS -typedef Kokkos::complex kokkos_complex_double; 
-typedef Kokkos::complex kokkos_complex_float; +// Test verbosity level. 0 = none, 1 = print residuals, 2 = print L,U +#define TEST_SPILUK_VERBOSE_LEVEL 0 + +// #define TEST_SPILUK_TINY_TEST namespace Test { -template -void run_test_spiluk() { - typedef Kokkos::View RowMapType; - typedef Kokkos::View EntriesType; - typedef Kokkos::View ValuesType; - typedef Kokkos::ArithTraits AT; - - const size_type nrows = 9; - const size_type nnz = 21; - - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - scalar_t MONE = scalar_t(-1); - - hrow_map(0) = 0; - hrow_map(1) = 3; - hrow_map(2) = 5; - hrow_map(3) = 6; - hrow_map(4) = 9; - hrow_map(5) = 11; - hrow_map(6) = 13; - hrow_map(7) = 15; - hrow_map(8) = 18; - hrow_map(9) = nnz; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 5; - hentries(3) = 1; - hentries(4) = 6; - hentries(5) = 2; - hentries(6) = 0; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 0; - hentries(10) = 4; - hentries(11) = 1; - hentries(12) = 5; - hentries(13) = 2; - hentries(14) = 6; - hentries(15) = 3; - hentries(16) = 4; - hentries(17) = 7; - hentries(18) = 3; - hentries(19) = 4; - hentries(20) = 8; - - hvalues(0) = 10; - hvalues(1) = 0.3; - hvalues(2) = 0.6; - hvalues(3) = 11; - hvalues(4) = 0.7; - hvalues(5) = 12; - hvalues(6) = 5; - hvalues(7) = 13; - hvalues(8) = 1; - hvalues(9) = 4; - hvalues(10) = 14; - hvalues(11) = 3; - hvalues(12) = 15; - hvalues(13) = 7; - hvalues(14) = 16; - hvalues(15) = 6; - hvalues(16) = 5; - hvalues(17) = 17; - hvalues(18) = 2; - hvalues(19) = 2.5; - hvalues(20) = 18; - - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); - - typedef 
KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> - KernelHandle; - - KernelHandle kh; - - // SPILUKAlgorithm::SEQLVLSCHD_RP - { - kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, 4 * nrows, - 4 * nrows); +#ifdef TEST_SPILUK_TINY_TEST +template +std::vector> get_fixture() { + std::vector> A = {{10.00, 1.00, 0.00, 0.00}, + {0.00, 11.00, 0.00, 0.00}, + {0.00, 2.00, 12.00, 0.00}, + {5.00, 0.00, 3.00, 13.00}}; + return A; +} +#else +template +std::vector> get_fixture() { + std::vector> A = { + {10.00, 0.00, 0.30, 0.00, 0.00, 0.60, 0.00, 0.00, 0.00}, + {0.00, 11.00, 0.00, 0.00, 0.00, 0.00, 0.70, 0.00, 0.00}, + {0.00, 0.00, 12.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00}, + {5.00, 0.00, 0.00, 13.00, 1.00, 0.00, 0.00, 0.00, 0.00}, + {4.00, 0.00, 0.00, 0.00, 14.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 3.00, 0.00, 0.00, 0.00, 15.00, 0.00, 0.00, 0.00}, + {0.00, 0.00, 7.00, 0.00, 0.00, 0.00, 16.00, 0.00, 0.00}, + {0.00, 0.00, 0.00, 6.00, 5.00, 0.00, 0.00, 17.00, 0.00}, + {0.00, 0.00, 0.00, 2.00, 2.50, 0.00, 0.00, 0.00, 18.00}}; + return A; +} +#endif - auto spiluk_handle = kh.get_spiluk_handle(); +template < + typename MatrixType, typename CRS, + typename std::enable_if::value>::type* = nullptr> +MatrixType get_A(CRS A_unblocked, const size_t) { + return A_unblocked; +} - // Allocate L and U as outputs - RowMapType L_row_map("L_row_map", nrows + 1); - EntriesType L_entries("L_entries", spiluk_handle->get_nnzL()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL()); - RowMapType U_row_map("U_row_map", nrows + 1); - EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); - ValuesType U_values("U_values", spiluk_handle->get_nnzU()); +template < + typename MatrixType, typename CRS, + typename std::enable_if::value>::type* = nullptr> +MatrixType get_A(CRS A_unblocked, const size_t block_size) { + // Convert to BSR + MatrixType 
A(A_unblocked, block_size); - typename KernelHandle::const_nnz_lno_t fill_lev = 2; + return A; +} - spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, - U_row_map, U_entries); +template < + typename MatrixType, typename RowMapType, typename EntriesType, + typename ValuesType, + typename std::enable_if::value>::type* = nullptr> +MatrixType make_matrix(const char* name, const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + const size_t) { + const auto nrows = row_map.extent(0) - 1; + return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, + entries); +} - Kokkos::fence(); +template < + typename MatrixType, typename RowMapType, typename EntriesType, + typename ValuesType, + typename std::enable_if::value>::type* = nullptr> +MatrixType make_matrix(const char* name, const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + const size_t block_size) { + const auto nrows = row_map.extent(0) - 1; + return MatrixType(name, nrows, nrows, values.extent(0), values, row_map, + entries, block_size); +} - Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); - Kokkos::resize(L_values, spiluk_handle->get_nnzL()); - Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); - Kokkos::resize(U_values, spiluk_handle->get_nnzU()); +static constexpr double EPS = 1e-7; - spiluk_handle->print_algorithm(); - spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, - L_entries, L_values, U_row_map, U_entries, U_values); +template +struct SpilukTest { + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using AT = Kokkos::ArithTraits; - Kokkos::fence(); + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename 
device::memory_space; + using range_policy = Kokkos::RangePolicy; - // Checking - typedef CrsMatrix crsMat_t; - crsMat_t A("A_Mtx", nrows, nrows, nnz, values, row_map, entries); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values, - L_row_map, L_entries); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values, - U_row_map, U_entries); + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; + + using Crs = CrsMatrix; + using Bsr = BsrMatrix; + + template + static typename AT::mag_type check_result_impl( + const AType& A, const LType& L, const UType& U, const size_type nrows, + const size_type block_size = 1) { + const scalar_t ZERO = scalar_t(0); + const scalar_t ONE = scalar_t(1); + const scalar_t MONE = scalar_t(-1); // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows); - Kokkos::deep_copy(e_one, 1.0); + ValuesType e_one("e_one", nrows * block_size); + Kokkos::deep_copy(e_one, ONE); // Create two views for spmv results - ValuesType bb("bb", nrows); - ValuesType bb_tmp("bb_tmp", nrows); + ValuesType bb("bb", nrows * block_size); + ValuesType bb_tmp("bb_tmp", nrows * block_size); // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); @@ -192,27 +170,111 @@ void run_test_spiluk() { typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); - EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + return diff_nrm / bb_nrm; + } - kh.destroy_spiluk_handle(); + static bool is_triangular(const RowMapType& drow_map, + const EntriesType& dentries, bool check_lower) { + const size_type nrows = drow_map.extent(0) - 1; + + auto row_map = Kokkos::create_mirror_view(drow_map); + auto entries = Kokkos::create_mirror_view(dentries); + Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(entries, dentries); + + for (size_type row = 0; row < nrows; ++row) { + const size_type row_nnz_begin = 
row_map(row); + const size_type row_nnz_end = row_map(row + 1); + for (size_type nnz = row_nnz_begin; nnz < row_nnz_end; ++nnz) { + const size_type col = entries(nnz); + if (col > row && check_lower) { + return false; + } else if (col < row && !check_lower) { + return false; + } + } + } + return true; } - // SPILUKAlgorithm::SEQLVLSCHD_TP1 - { - kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, 4 * nrows, - 4 * nrows); + template + static void check_result(const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + const RowMapType& L_row_map, + const EntriesType& L_entries, + const ValuesType& L_values, + const RowMapType& U_row_map, + const EntriesType& U_entries, + const ValuesType& U_values, const lno_t fill_lev, + const size_type block_size = 1) { + using sp_matrix_type = std::conditional_t; + + KK_REQUIRE(UseBlocks || (block_size == 1)); + + // Checking + const auto nrows = row_map.extent(0) - 1; + auto A = make_matrix("A_Mtx", row_map, entries, values, + block_size); + auto L = make_matrix("L_Mtx", L_row_map, L_entries, + L_values, block_size); + auto U = make_matrix("U_Mtx", U_row_map, U_entries, + U_values, block_size); + + EXPECT_TRUE(is_triangular(L_row_map, L_entries, true)); + EXPECT_TRUE(is_triangular(U_row_map, U_entries, false)); + + const auto result = check_result_impl(A, L, U, nrows, block_size); + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "For nrows=" << nrows << ", fill_level=" << fill_lev; + if (UseBlocks) { + std::cout << ", block_size=" << block_size; + } else { + std::cout << ", unblocked"; + } + std::cout << " had residual: " << result << std::endl; + } + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "L result" << std::endl; + print_matrix( + decompress_matrix(L_row_map, L_entries, L_values, block_size)); + std::cout << "U result" << std::endl; + print_matrix( + decompress_matrix(U_row_map, U_entries, U_values, block_size)); + } + + if (fill_lev > 1) { + if (UseBlocks) { + 
EXPECT_LT(result, 1e-2); + } else { + EXPECT_LT(result, 1e-4); + } + } + } + + template + static std::tuple + run_and_check_spiluk(KernelHandle& kh, const RowMapType& row_map, + const EntriesType& entries, const ValuesType& values, + SPILUKAlgorithm alg, const lno_t fill_lev, + const size_type block_size = 1) { + KK_REQUIRE(UseBlocks || (block_size == 1)); + + const size_type block_items = block_size * block_size; + const size_type nrows = row_map.extent(0) - 1; + kh.create_spiluk_handle(alg, nrows, 40 * nrows, 40 * nrows, + !UseBlocks ? 0 : block_size); auto spiluk_handle = kh.get_spiluk_handle(); + if (TeamSize != -1) { + spiluk_handle->set_team_size(TeamSize); + } // Allocate L and U as outputs RowMapType L_row_map("L_row_map", nrows + 1); EntriesType L_entries("L_entries", spiluk_handle->get_nnzL()); - ValuesType L_values("L_values", spiluk_handle->get_nnzL()); RowMapType U_row_map("U_row_map", nrows + 1); EntriesType U_entries("U_entries", spiluk_handle->get_nnzU()); - ValuesType U_values("U_values", spiluk_handle->get_nnzU()); - - typename KernelHandle::const_nnz_lno_t fill_lev = 2; spiluk_symbolic(&kh, fill_lev, row_map, entries, L_row_map, L_entries, U_row_map, U_entries); @@ -220,292 +282,609 @@ void run_test_spiluk() { Kokkos::fence(); Kokkos::resize(L_entries, spiluk_handle->get_nnzL()); - Kokkos::resize(L_values, spiluk_handle->get_nnzL()); Kokkos::resize(U_entries, spiluk_handle->get_nnzU()); - Kokkos::resize(U_values, spiluk_handle->get_nnzU()); + ValuesType L_values("L_values", spiluk_handle->get_nnzL() * block_items); + ValuesType U_values("U_values", spiluk_handle->get_nnzU() * block_items); - spiluk_handle->print_algorithm(); spiluk_numeric(&kh, fill_lev, row_map, entries, values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values); Kokkos::fence(); - // Checking - typedef CrsMatrix crsMat_t; - crsMat_t A("A_Mtx", nrows, nrows, nnz, values, row_map, entries); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values, - 
L_row_map, L_entries); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values, - U_row_map, U_entries); + check_result(row_map, entries, values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, fill_lev, + block_size); - // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows); - Kokkos::deep_copy(e_one, 1.0); + kh.destroy_spiluk_handle(); - // Create two views for spmv results - ValuesType bb("bb", nrows); - ValuesType bb_tmp("bb_tmp", nrows); +#ifdef TEST_SPILUK_FULL_CHECKS + // If block_size is 1, results should exactly match unblocked results + if (block_size == 1 && UseBlocks) { + const auto [L_row_map_u, L_entries_u, L_values_u, U_row_map_u, + U_entries_u, U_values_u] = + run_and_check_spiluk(kh, row_map, entries, values, + alg, fill_lev); + + EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_u, EPS); + EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_u, EPS); + EXPECT_NEAR_KK_1DVIEW(L_values, L_values_u, EPS); + EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_u, EPS); + EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_u, EPS); + EXPECT_NEAR_KK_1DVIEW(U_values, U_values_u, EPS); + } - // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) - KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + // Check that team size = 1 produces same result + if (TeamSize != 1) { + const auto [L_row_map_ts1, L_entries_ts1, L_values_ts1, U_row_map_ts1, + U_entries_ts1, U_values_ts1] = + run_and_check_spiluk(kh, row_map, entries, values, alg, + fill_lev, block_size); + + EXPECT_NEAR_KK_1DVIEW(L_row_map, L_row_map_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(L_entries, L_entries_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(L_values, L_values_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_row_map, U_row_map_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_entries, U_entries_ts1, EPS); + EXPECT_NEAR_KK_1DVIEW(U_values, U_values_ts1, EPS); + } +#endif - typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + return std::make_tuple(L_row_map, L_entries, L_values, U_row_map, U_entries, + 
U_values); + } - KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); - KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + static void run_test_spiluk() { + std::vector> A = get_fixture(); - typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "A input" << std::endl; + print_matrix(A); + } - EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + RowMapType row_map; + EntriesType entries; + ValuesType values; - kh.destroy_spiluk_handle(); + compress_matrix(row_map, entries, values, A); + + const lno_t fill_lev = 2; + + KernelHandle kh; + + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); } -} -template -void run_test_spiluk_streams(int test_algo, int nstreams) { - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; - using RowMapType_hostmirror = typename RowMapType::HostMirror; - using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; - using crsMat_t = CrsMatrix; - using AT = Kokkos::ArithTraits; + static void run_test_spiluk_blocks() { + std::vector> A = get_fixture(); - // Workaround for OpenMP: skip tests if concurrency < nstreams because of - // not enough resource to partition - bool run_streams_test = true; -#ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - int exec_concurrency = execution_space().concurrency(); - if (exec_concurrency < nstreams) { - run_streams_test = false; - std::cout << " Skip stream test: concurrency = " << exec_concurrency - << std::endl; + if (TEST_SPILUK_VERBOSE_LEVEL > 1) { + std::cout << "A input" << std::endl; + print_matrix(A); + } + + RowMapType 
row_map, brow_map; + EntriesType entries, bentries; + ValuesType values, bvalues; + + compress_matrix(row_map, entries, values, A); + + const size_type nrows = A.size(); + const size_type nnz = values.extent(0); + const lno_t fill_lev = 2; + const size_type block_size = nrows % 2 == 0 ? 2 : 3; + ASSERT_EQ(nrows % block_size, 0); + + KernelHandle kh; + + Crs crs("crs for block spiluk test", nrows, nrows, nnz, values, row_map, + entries); + + std::vector block_sizes = {1, block_size}; + + for (auto block_size_itr : block_sizes) { + Bsr bsr(crs, block_size_itr); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); + + run_and_check_spiluk(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size_itr); + } + } + + static void run_test_spiluk_scale() { + // Create a diagonally dominant sparse matrix to test: + constexpr auto nrows = 5000; + constexpr auto diagDominance = 2; + + size_type nnz = 10 * nrows; + auto A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + + KokkosSparse::sort_crs_matrix(A); + + // Pull out views from CRS + RowMapType row_map("row_map", A.graph.row_map.extent(0)); + EntriesType entries("entries", A.graph.entries.extent(0)); + ValuesType values("values", A.values.extent(0)); + Kokkos::deep_copy(row_map, A.graph.row_map); + Kokkos::deep_copy(entries, A.graph.entries); + Kokkos::deep_copy(values, A.values); + + for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { + KernelHandle kh; + + run_and_check_spiluk(kh, row_map, entries, values, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev); + } + } + + static void run_test_spiluk_scale_blocks() { + // Create a 
diagonally dominant sparse matrix to test: + constexpr auto nrows = 5000; + constexpr auto diagDominance = 2; + + RowMapType brow_map; + EntriesType bentries; + ValuesType bvalues; + + // const size_type block_size = 10; + + size_type nnz = 10 * nrows; + auto A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + + KokkosSparse::sort_crs_matrix(A); + + std::vector block_sizes = {1, 2, 4, 10}; + + for (auto block_size : block_sizes) { + // Convert to BSR + Bsr bsr(A, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); + + for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { + KernelHandle kh; + + run_and_check_spiluk(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, fill_lev, + block_size); + } } } + + static void run_test_spiluk_streams(SPILUKAlgorithm test_algo, int nstreams) { + // Workaround for OpenMP: skip tests if concurrency < nstreams because of + // not enough resource to partition + bool run_streams_test = true; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same::value) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; + } + } #endif - if (!run_streams_test) return; - - const size_type nrows = 9; - const size_type nnz = 21; - - std::vector instances; - if (nstreams == 1) - instances = Kokkos::Experimental::partition_space(execution_space(), 1); - else if (nstreams == 2) - instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); - else if (nstreams == 3) - instances = - 
Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); - - std::vector kh_v(nstreams); - std::vector kh_ptr_v(nstreams); - std::vector A_row_map_v(nstreams); - std::vector A_entries_v(nstreams); - std::vector A_values_v(nstreams); - std::vector L_row_map_v(nstreams); - std::vector L_entries_v(nstreams); - std::vector L_values_v(nstreams); - std::vector U_row_map_v(nstreams); - std::vector U_entries_v(nstreams); - std::vector U_values_v(nstreams); - - RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); - EntriesType_hostmirror hentries("hentries", nnz); - ValuesType_hostmirror hvalues("hvalues", nnz); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - scalar_t MONE = scalar_t(-1); - - hrow_map(0) = 0; - hrow_map(1) = 3; - hrow_map(2) = 5; - hrow_map(3) = 6; - hrow_map(4) = 9; - hrow_map(5) = 11; - hrow_map(6) = 13; - hrow_map(7) = 15; - hrow_map(8) = 18; - hrow_map(9) = nnz; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 5; - hentries(3) = 1; - hentries(4) = 6; - hentries(5) = 2; - hentries(6) = 0; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 0; - hentries(10) = 4; - hentries(11) = 1; - hentries(12) = 5; - hentries(13) = 2; - hentries(14) = 6; - hentries(15) = 3; - hentries(16) = 4; - hentries(17) = 7; - hentries(18) = 3; - hentries(19) = 4; - hentries(20) = 8; - - hvalues(0) = 10; - hvalues(1) = 0.3; - hvalues(2) = 0.6; - hvalues(3) = 11; - hvalues(4) = 0.7; - hvalues(5) = 12; - hvalues(6) = 5; - hvalues(7) = 13; - hvalues(8) = 1; - hvalues(9) = 4; - hvalues(10) = 14; - hvalues(11) = 3; - hvalues(12) = 15; - hvalues(13) = 7; - hvalues(14) = 16; - hvalues(15) = 6; - hvalues(16) = 5; - hvalues(17) = 17; - hvalues(18) = 2; - hvalues(19) = 2.5; - hvalues(20) = 18; - - typename KernelHandle::const_nnz_lno_t fill_lev = 2; - - for (int i = 0; i < nstreams; i++) { - // Allocate A as input - A_row_map_v[i] = RowMapType("A_row_map", 
nrows + 1); - A_entries_v[i] = EntriesType("A_entries", nnz); - A_values_v[i] = ValuesType("A_values", nnz); - - // Copy from host to device - Kokkos::deep_copy(A_row_map_v[i], hrow_map); - Kokkos::deep_copy(A_entries_v[i], hentries); - Kokkos::deep_copy(A_values_v[i], hvalues); - - // Create handle - kh_v[i] = KernelHandle(); - if (test_algo == 0) - kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_RP, nrows, - 4 * nrows, 4 * nrows); - else if (test_algo == 1) - kh_v[i].create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, nrows, - 4 * nrows, 4 * nrows); - kh_ptr_v[i] = &kh_v[i]; - - auto spiluk_handle = kh_v[i].get_spiluk_handle(); - std::cout << " Stream " << i << ": "; - spiluk_handle->print_algorithm(); + if (!run_streams_test) return; - // Allocate L and U as outputs - L_row_map_v[i] = RowMapType("L_row_map", nrows + 1); - L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); - L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL()); - U_row_map_v[i] = RowMapType("U_row_map", nrows + 1); - U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); - U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU()); - - // Symbolic phase - spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], - L_row_map_v[i], L_entries_v[i], U_row_map_v[i], - U_entries_v[i], nstreams); + std::vector weights(nstreams, 1); + std::vector instances = + Kokkos::Experimental::partition_space(execution_space(), weights); - Kokkos::fence(); + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector A_row_map_v(nstreams); + std::vector A_entries_v(nstreams); + std::vector A_values_v(nstreams); + std::vector L_row_map_v(nstreams); + std::vector L_entries_v(nstreams); + std::vector L_values_v(nstreams); + std::vector U_row_map_v(nstreams); + std::vector U_entries_v(nstreams); + std::vector U_values_v(nstreams); - Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); - Kokkos::resize(L_values_v[i], 
spiluk_handle->get_nnzL()); - Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); - Kokkos::resize(U_values_v[i], spiluk_handle->get_nnzU()); - } // Done handle creation and spiluk_symbolic on all streams - - // Numeric phase - spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, - A_entries_v, A_values_v, L_row_map_v, L_entries_v, - L_values_v, U_row_map_v, U_entries_v, U_values_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) { - auto spiluk_handle = kh_v[i].get_spiluk_handle(); - crsMat_t A("A_Mtx", nrows, nrows, nnz, A_values_v[i], A_row_map_v[i], - A_entries_v[i]); - crsMat_t L("L_Mtx", nrows, nrows, spiluk_handle->get_nnzL(), L_values_v[i], - L_row_map_v[i], L_entries_v[i]); - crsMat_t U("U_Mtx", nrows, nrows, spiluk_handle->get_nnzU(), U_values_v[i], - U_row_map_v[i], U_entries_v[i]); + std::vector> Afix = get_fixture(); - // Create a reference view e set to all 1's - ValuesType e_one("e_one", nrows); - Kokkos::deep_copy(e_one, 1.0); + RowMapType row_map; + EntriesType entries; + ValuesType values; - // Create two views for spmv results - ValuesType bb("bb", nrows); - ValuesType bb_tmp("bb_tmp", nrows); + compress_matrix(row_map, entries, values, Afix); - // Compute norm2(L*U*e_one - A*e_one)/norm2(A*e_one) - KokkosSparse::spmv("N", ONE, A, e_one, ZERO, bb); + const size_type nrows = Afix.size(); + const size_type nnz = values.extent(0); - typename AT::mag_type bb_nrm = KokkosBlas::nrm2(bb); + RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); + EntriesType_hostmirror hentries("hentries", nnz); + ValuesType_hostmirror hvalues("hvalues", nnz); - KokkosSparse::spmv("N", ONE, U, e_one, ZERO, bb_tmp); - KokkosSparse::spmv("N", ONE, L, bb_tmp, MONE, bb); + Kokkos::deep_copy(hrow_map, row_map); + Kokkos::deep_copy(hentries, entries); + Kokkos::deep_copy(hvalues, values); - typename AT::mag_type diff_nrm = KokkosBlas::nrm2(bb); + typename KernelHandle::const_nnz_lno_t 
fill_lev = 2; + + for (int i = 0; i < nstreams; i++) { + // Allocate A as input + A_row_map_v[i] = RowMapType("A_row_map", nrows + 1); + A_entries_v[i] = EntriesType("A_entries", nnz); + A_values_v[i] = ValuesType("A_values", nnz); + + // Copy from host to device + Kokkos::deep_copy(A_row_map_v[i], hrow_map); + Kokkos::deep_copy(A_entries_v[i], hentries); + Kokkos::deep_copy(A_values_v[i], hvalues); + + // Create handle + kh_v[i] = KernelHandle(); + kh_v[i].create_spiluk_handle(test_algo, nrows, 4 * nrows, 4 * nrows); + kh_ptr_v[i] = &kh_v[i]; + + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + + // Allocate L and U as outputs + L_row_map_v[i] = RowMapType("L_row_map", nrows + 1); + L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); + U_row_map_v[i] = RowMapType("U_row_map", nrows + 1); + U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); + + // Symbolic phase + spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], + L_row_map_v[i], L_entries_v[i], U_row_map_v[i], + U_entries_v[i], nstreams); + + Kokkos::fence(); - EXPECT_TRUE((diff_nrm / bb_nrm) < 1e-4); + Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); + L_values_v[i] = ValuesType("L_values", spiluk_handle->get_nnzL()); + U_values_v[i] = ValuesType("U_values", spiluk_handle->get_nnzU()); + } // Done handle creation and spiluk_symbolic on all streams - kh_v[i].destroy_spiluk_handle(); + // Numeric phase + spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, + A_entries_v, A_values_v, L_row_map_v, L_entries_v, + L_values_v, U_row_map_v, U_entries_v, U_values_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + fill_lev); + + 
kh_v[i].destroy_spiluk_handle(); + } } -} + + static void run_test_spiluk_streams_blocks(SPILUKAlgorithm test_algo, + int nstreams) { + // Workaround for OpenMP: skip tests if concurrency < nstreams because of + // not enough resource to partition + bool run_streams_test = true; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same::value) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; + } + } +#endif + if (!run_streams_test) return; + + std::vector weights(nstreams, 1); + std::vector instances = + Kokkos::Experimental::partition_space(execution_space(), weights); + + std::vector kh_v(nstreams); + std::vector kh_ptr_v(nstreams); + std::vector A_row_map_v(nstreams); + std::vector A_entries_v(nstreams); + std::vector A_values_v(nstreams); + std::vector L_row_map_v(nstreams); + std::vector L_entries_v(nstreams); + std::vector L_values_v(nstreams); + std::vector U_row_map_v(nstreams); + std::vector U_entries_v(nstreams); + std::vector U_values_v(nstreams); + + std::vector> Afix = get_fixture(); + + RowMapType row_map, brow_map; + EntriesType entries, bentries; + ValuesType values, bvalues; + + compress_matrix(row_map, entries, values, Afix); + + const size_type nrows = Afix.size(); + const size_type block_size = nrows % 2 == 0 ? 
2 : 3; + const size_type block_items = block_size * block_size; + ASSERT_EQ(nrows % block_size, 0); + + // Convert to BSR + Crs crs("crs for block spiluk test", nrows, nrows, values.extent(0), values, + row_map, entries); + Bsr bsr(crs, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, bsr.graph.row_map.extent(0)); + Kokkos::resize(bentries, bsr.graph.entries.extent(0)); + Kokkos::resize(bvalues, bsr.values.extent(0)); + Kokkos::deep_copy(brow_map, bsr.graph.row_map); + Kokkos::deep_copy(bentries, bsr.graph.entries); + Kokkos::deep_copy(bvalues, bsr.values); + + const size_type bnrows = brow_map.extent(0) - 1; + const size_type bnnz = bentries.extent(0); + + RowMapType_hostmirror hrow_map("hrow_map", bnrows + 1); + EntriesType_hostmirror hentries("hentries", bnnz); + ValuesType_hostmirror hvalues("hvalues", bnnz * block_items); + + Kokkos::deep_copy(hrow_map, brow_map); + Kokkos::deep_copy(hentries, bentries); + Kokkos::deep_copy(hvalues, bvalues); + + typename KernelHandle::const_nnz_lno_t fill_lev = 2; + + for (int i = 0; i < nstreams; i++) { + // Allocate A as input + A_row_map_v[i] = RowMapType("A_row_map", bnrows + 1); + A_entries_v[i] = EntriesType("A_entries", bnnz); + A_values_v[i] = ValuesType("A_values", bnnz * block_items); + + // Copy from host to device + Kokkos::deep_copy(A_row_map_v[i], hrow_map); + Kokkos::deep_copy(A_entries_v[i], hentries); + Kokkos::deep_copy(A_values_v[i], hvalues); + + // Create handle + kh_v[i] = KernelHandle(); + kh_v[i].create_spiluk_handle(test_algo, bnrows, 4 * bnrows, 4 * bnrows, + block_size); + kh_ptr_v[i] = &kh_v[i]; + + auto spiluk_handle = kh_v[i].get_spiluk_handle(); + + // Allocate L and U as outputs + L_row_map_v[i] = RowMapType("L_row_map", bnrows + 1); + L_entries_v[i] = EntriesType("L_entries", spiluk_handle->get_nnzL()); + U_row_map_v[i] = RowMapType("U_row_map", bnrows + 1); + U_entries_v[i] = EntriesType("U_entries", spiluk_handle->get_nnzU()); + + // Symbolic phase + 
spiluk_symbolic(kh_ptr_v[i], fill_lev, A_row_map_v[i], A_entries_v[i], + L_row_map_v[i], L_entries_v[i], U_row_map_v[i], + U_entries_v[i], nstreams); + + Kokkos::fence(); + + Kokkos::resize(L_entries_v[i], spiluk_handle->get_nnzL()); + Kokkos::resize(U_entries_v[i], spiluk_handle->get_nnzU()); + L_values_v[i] = + ValuesType("L_values", spiluk_handle->get_nnzL() * block_items); + U_values_v[i] = + ValuesType("U_values", spiluk_handle->get_nnzU() * block_items); + } // Done handle creation and spiluk_symbolic on all streams + + // Numeric phase + spiluk_numeric_streams(instances, kh_ptr_v, fill_lev, A_row_map_v, + A_entries_v, A_values_v, L_row_map_v, L_entries_v, + L_values_v, U_row_map_v, U_entries_v, U_values_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + check_result(A_row_map_v[i], A_entries_v[i], A_values_v[i], + L_row_map_v[i], L_entries_v[i], L_values_v[i], + U_row_map_v[i], U_entries_v[i], U_values_v[i], + fill_lev, block_size); + + kh_v[i].destroy_spiluk_handle(); + } + } + + template + static void run_test_spiluk_precond() { + // Test using spiluk as a preconditioner + // Does (LU)^inv Ax = (LU)^inv b converge faster than solving Ax=b? + + // Create a diagonally dominant sparse matrix to test: + using sp_matrix_type = std::conditional_t; + + constexpr auto nrows = 5000; + constexpr auto m = 15; + constexpr auto diagDominance = 2; + constexpr auto tol = 1e-5; + constexpr bool verbose = false; + + if (UseBlocks) { + // Skip test if not on host. 
block trsv only works on host + static constexpr bool is_host = + std::is_same::value; + if (!is_host) { + return; + } + } + + RowMapType brow_map; + EntriesType bentries; + ValuesType bvalues; + + size_type nnz = 10 * nrows; + auto A_unblocked = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix( + nrows, nrows, nnz, 0, lno_t(0.01 * nrows), diagDominance); + + KokkosSparse::sort_crs_matrix(A_unblocked); + + std::vector block_sizes_blocked = {1, 2, 4, 10}; + std::vector block_sizes_unblocked = {1}; + std::vector block_sizes = + UseBlocks ? block_sizes_blocked : block_sizes_unblocked; + + for (auto block_size : block_sizes) { + // Convert to BSR if block enabled + auto A = get_A(A_unblocked, block_size); + + // Pull out views from BSR + Kokkos::resize(brow_map, A.graph.row_map.extent(0)); + Kokkos::resize(bentries, A.graph.entries.extent(0)); + Kokkos::resize(bvalues, A.values.extent(0)); + Kokkos::deep_copy(brow_map, A.graph.row_map); + Kokkos::deep_copy(bentries, A.graph.entries); + Kokkos::deep_copy(bvalues, A.values); + + // Make kernel handles + KernelHandle kh; + kh.create_gmres_handle(m, tol); + auto gmres_handle = kh.get_gmres_handle(); + gmres_handle->set_verbose(verbose); + using GMRESHandle = + typename std::remove_reference::type; + + for (lno_t fill_lev = 0; fill_lev < 4; ++fill_lev) { + const auto [L_row_map, L_entries, L_values, U_row_map, U_entries, + U_values] = + run_and_check_spiluk(kh, brow_map, bentries, bvalues, + SPILUKAlgorithm::SEQLVLSCHD_TP1, + fill_lev, block_size); + + // Create L, U + auto L = make_matrix("L_Mtx", L_row_map, L_entries, + L_values, block_size); + auto U = make_matrix("U_Mtx", U_row_map, U_entries, + U_values, block_size); + + // Set initial vectors: + ValuesType X("X", nrows); // Solution and initial guess + ValuesType Wj("Wj", nrows); // For checking residuals at end. 
+ ValuesType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + nrows); // right-hand side vec + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + int num_iters_plain(0), num_iters_precond(0); + + // Solve Ax = b + { + gmres(&kh, A, B, X); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_plain = gmres_handle->get_num_iters(); + + EXPECT_GT(num_iters_plain, 0); + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "Without LUPrec, with block_size=" << block_size + << ", converged in " << num_iters_plain + << " steps with endres=" << endRes << std::endl; + } + } + + // Solve Ax = b with LU preconditioner. + { + gmres_handle->reset_handle(m, tol); + gmres_handle->set_verbose(verbose); + + // Make precond. + KokkosSparse::Experimental::LUPrec + myPrec(L, U); + + // reset X for next gmres call + Kokkos::deep_copy(X, 0.0); + + gmres(&kh, A, B, X, &myPrec); + + // Double check residuals at end of solve: + float_t nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. 
+ float_t endRes = KokkosBlas::nrm2(B) / nrmB; + + const auto conv_flag = gmres_handle->get_conv_flag_val(); + num_iters_precond = gmres_handle->get_num_iters(); + + EXPECT_LT(endRes, gmres_handle->get_tol()); + EXPECT_EQ(conv_flag, GMRESHandle::Flag::Conv); + EXPECT_LT(num_iters_precond, num_iters_plain); + + if (TEST_SPILUK_VERBOSE_LEVEL > 0) { + std::cout << "With LUPrec, with block_size=" << block_size + << ", and fill_level=" << fill_lev << ", converged in " + << num_iters_precond << " steps with endres=" << endRes + << std::endl; + } + } + } + } + } +}; } // namespace Test template void test_spiluk() { - Test::run_test_spiluk(); + using TestStruct = Test::SpilukTest; + TestStruct::run_test_spiluk(); + TestStruct::run_test_spiluk_blocks(); + TestStruct::run_test_spiluk_scale(); + TestStruct::run_test_spiluk_scale_blocks(); + TestStruct::template run_test_spiluk_precond(); + TestStruct::template run_test_spiluk_precond(); } template void test_spiluk_streams() { - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 1 stream" << std::endl; - Test::run_test_spiluk_streams(0, 1); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; - Test::run_test_spiluk_streams(0, 2); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; - Test::run_test_spiluk_streams(0, 3); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; - Test::run_test_spiluk_streams(0, 4); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 1 stream" << std::endl; - Test::run_test_spiluk_streams(1, 1); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; - Test::run_test_spiluk_streams(1, 2); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; - Test::run_test_spiluk_streams(1, 3); - - std::cout << "SPILUKAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; - Test::run_test_spiluk_streams(1, 4); + using TestStruct = Test::SpilukTest; + + 
TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 1); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 2); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 3); + TestStruct::run_test_spiluk_streams(SPILUKAlgorithm::SEQLVLSCHD_TP1, 4); + + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 1); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 2); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 3); + TestStruct::run_test_spiluk_streams_blocks(SPILUKAlgorithm::SEQLVLSCHD_TP1, + 4); } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv.hpp index 990fcc1a3053..c5107fcf0ab8 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv.hpp @@ -24,7 +24,6 @@ #include #include -#include "KokkosKernels_Controls.hpp" #include "KokkosKernels_default_types.hpp" // #ifndef kokkos_complex_double @@ -180,10 +179,10 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, Kokkos::fence(); } -template +template void check_spmv( - const KokkosKernels::Experimental::Controls &controls, crsMat_t input_mat, - x_vector_type x, y_vector_type y, + handle_t *handle, crsMat_t input_mat, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, typename y_vector_type::non_const_value_type beta, const std::string &mode, typename Kokkos::ArithTraits::mag_type @@ -208,7 +207,7 @@ void check_spmv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(controls, mode.data(), alpha, input_mat, x, beta, y); + KokkosSparse::spmv(handle, mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -229,9 +228,10 @@ void 
check_spmv( EXPECT_TRUE(num_errors == 0); } -template +template void check_spmv_mv( - crsMat_t input_mat, x_vector_type x, y_vector_type y, + Handle *handle, crsMat_t input_mat, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, typename y_vector_type::non_const_value_type beta, int numMV, @@ -259,7 +259,7 @@ void check_spmv_mv( bool threw = false; std::string msg; try { - KokkosSparse::spmv(mode.data(), alpha, input_mat, x, beta, y); + KokkosSparse::spmv(handle, mode.data(), alpha, input_mat, x, beta, y); Kokkos::fence(); } catch (std::exception &e) { threw = true; @@ -388,51 +388,6 @@ void check_spmv_mv_struct( } } // check_spmv_mv_struct -template -void check_spmv_controls( - KokkosKernels::Experimental::Controls controls, crsMat_t input_mat, - x_vector_type x, y_vector_type y, - typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, - typename Kokkos::ArithTraits::mag_type - max_val) { - // typedef typename crsMat_t::StaticCrsGraphType graph_t; - using ExecSpace = typename crsMat_t::execution_space; - using my_exec_space = Kokkos::RangePolicy; - using y_value_type = typename y_vector_type::non_const_value_type; - using y_value_trait = Kokkos::ArithTraits; - using y_value_mag_type = typename y_value_trait::mag_type; - - // y is the quantity being tested here, - // so let us use y_value_type to determine - // the appropriate tolerance precision. - const y_value_mag_type eps = - std::is_same::value ? 
2 * 1e-3 : 1e-7; - const size_t nr = input_mat.numRows(); - y_vector_type expected_y("expected", nr); - Kokkos::deep_copy(expected_y, y); - Kokkos::fence(); - - sequential_spmv(input_mat, x, expected_y, alpha, beta); - -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - controls.setParameter("algorithm", "merge"); - printf("requested merge based algorithm\n"); -#endif - - KokkosSparse::spmv(controls, "N", alpha, input_mat, x, beta, y); - int num_errors = 0; - Kokkos::parallel_reduce( - "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), - fSPMV(expected_y, y, eps, max_val), - num_errors); - if (num_errors > 0) - printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n", - num_errors, y.extent_int(0), y_value_trait::abs(alpha), - y_value_trait::abs(beta)); - EXPECT_TRUE(num_errors == 0); -} // check_spmv_controls - } // namespace Test template @@ -452,15 +407,16 @@ Kokkos::complex randomUpperBound>(int mag) { template -void test_spmv(const KokkosKernels::Experimental::Controls &controls, - lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, bool heavy) { +void test_spmv(KokkosSparse::SPMVAlgorithm algo, lno_t numRows, size_type nnz, + lno_t bandwidth, lno_t row_size_variance, bool heavy) { using crsMat_t = typename KokkosSparse::CrsMatrix; using scalar_view_t = typename crsMat_t::values_type::non_const_type; using x_vector_type = scalar_view_t; using y_vector_type = scalar_view_t; using mag_t = typename Kokkos::ArithTraits::mag_type; + using handle_t = + KokkosSparse::SPMVHandle; constexpr mag_t max_x = static_cast(1); constexpr mag_t max_y = static_cast(1); @@ -504,12 +460,17 @@ void test_spmv(const KokkosKernels::Experimental::Controls &controls, testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } + + // This handle can be reused for all following calls, since the matrix does + // not change + handle_t handle(algo); + for (auto mode : nonTransModes) { for (double alpha : testAlphaBeta) { for (double beta : 
testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(controls, input_mat, input_x, output_y, alpha, beta, + Test::check_spmv(&handle, input_mat, input_x, output_y, alpha, beta, mode, max_error); } } @@ -520,7 +481,7 @@ void test_spmv(const KokkosKernels::Experimental::Controls &controls, // hoping the transpose won't have a long column... mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv(controls, input_mat, input_xt, output_yt, alpha, beta, + Test::check_spmv(&handle, input_mat, input_xt, output_yt, alpha, beta, mode, max_error); } } @@ -531,29 +492,10 @@ template void test_spmv_algorithms(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy) { - { - KokkosKernels::Experimental::Controls controls; - test_spmv( - controls, numRows, nnz, bandwidth, row_size_variance, heavy); - } - - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "native"); - test_spmv( - controls, numRows, nnz, bandwidth, row_size_variance, heavy); - } - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "merge"); - test_spmv( - controls, numRows, nnz, bandwidth, row_size_variance, heavy); - } - { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "native-merge"); - test_spmv( - controls, numRows, nnz, bandwidth, row_size_variance, heavy); + using namespace KokkosSparse; + for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_MERGE_PATH}) { + test_spmv(algo, numRows, nnz, bandwidth, + row_size_variance, heavy); } } @@ -573,14 +515,16 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, void, size_type>; using ViewTypeX = Kokkos::View; using ViewTypeY = Kokkos::View; + using handle_t = + KokkosSparse::SPMVHandle; - ViewTypeX b_x("A", numRows, numMV); - ViewTypeY b_y("B", numCols, numMV); - ViewTypeY b_y_copy("B", numCols, numMV); + 
ViewTypeX b_x("A", numCols, numMV); + ViewTypeY b_y("B", numRows, numMV); + ViewTypeY b_y_copy("B", numRows, numMV); - ViewTypeX b_xt("A", numCols, numMV); - ViewTypeY b_yt("B", numRows, numMV); - ViewTypeY b_yt_copy("B", numRows, numMV); + ViewTypeX b_xt("A", numRows, numMV); + ViewTypeY b_yt("B", numCols, numMV); + ViewTypeY b_yt_copy("B", numCols, numMV); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -613,13 +557,14 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, testAlphaBeta.push_back(-1.0); testAlphaBeta.push_back(2.5); } + handle_t handle; for (auto mode : nonTransModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, numMV, - mode, max_error); + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, alpha, beta, + numMV, mode, max_error); } } } @@ -629,17 +574,17 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, // hoping the transpose won't have a long column... 
mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, alpha, beta, - numMV, mode, max_error); + Test::check_spmv_mv(&handle, input_mat, b_xt, b_yt, b_yt_copy, alpha, + beta, numMV, mode, max_error); } } } } template -void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, int numMV) { + typename layout_x, typename layout_y, class Device> +void test_spmv_mv_heavy(lno_t numRows, lno_t numCols, size_type nnz, + lno_t bandwidth, lno_t row_size_variance, int numMV) { #if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) || defined(KOKKOS_ARCH_A64FX) if (std::is_same>::value) { std::cerr @@ -651,16 +596,18 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, #endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL || KOKKOS_ARCH_A64FX using crsMat_t = typename KokkosSparse::CrsMatrix; - using ViewTypeX = Kokkos::View; - using ViewTypeY = Kokkos::View; + using ViewTypeX = Kokkos::View; + using ViewTypeY = Kokkos::View; using mag_t = typename Kokkos::ArithTraits::mag_type; + using handle_t = + KokkosSparse::SPMVHandle; constexpr mag_t max_x = static_cast(10); constexpr mag_t max_y = static_cast(10); constexpr mag_t max_val = static_cast(10); crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numRows, nnz, row_size_variance, bandwidth); + numRows, numCols, nnz, row_size_variance, bandwidth); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -668,26 +615,35 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, numRows ? 
(nnz / numRows + row_size_variance) : 0; for (int nv = 1; nv <= numMV; nv++) { - ViewTypeX b_x("A", numRows, nv); + ViewTypeX b_x("A", numCols, nv); ViewTypeY b_y("B", numRows, nv); ViewTypeY b_y_copy("B", numRows, nv); + ViewTypeX b_xt("A", numRows, nv); + ViewTypeY b_yt("B", numCols, nv); + ViewTypeY b_yt_copy("B", numCols, nv); + Kokkos::fill_random(b_x, rand_pool, scalar_t(10)); Kokkos::fill_random(b_y, rand_pool, scalar_t(10)); + Kokkos::fill_random(b_xt, rand_pool, scalar_t(10)); + Kokkos::fill_random(b_yt, rand_pool, scalar_t(10)); Kokkos::fill_random(input_mat.values, rand_pool, scalar_t(10)); Kokkos::deep_copy(b_y_copy, b_y); - - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "N", - max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "N", - max_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, "N", - max_y + max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, "T", - max_nnz_per_row * max_val * max_x); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, "T", - max_y); + Kokkos::deep_copy(b_yt_copy, b_yt); + + handle_t handle; + + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, + "N", max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, + "N", max_y); + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, + "N", max_y + max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(&handle, input_mat, b_xt, b_yt, b_yt_copy, 1.0, 0.0, nv, + "T", max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(&handle, input_mat, b_xt, b_yt, b_yt_copy, 0.0, 1.0, nv, + "T", max_y); // Testing all modes together, since matrix is square std::vector modes = {"N", "C", "T", "H"}; std::vector testAlphaBeta = {0.0, 1.0, -1.0, 2.5}; @@ -696,8 +652,13 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, 
for (double beta : testAlphaBeta) { mag_t max_error = beta * max_y + alpha * max_nnz_per_row * max_val * max_x; - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, nv, - mode, max_error); + if (*mode == 'N' || *mode == 'C') { + Test::check_spmv_mv(&handle, input_mat, b_x, b_y, b_y_copy, alpha, + beta, nv, mode, max_error); + } else { + Test::check_spmv_mv(&handle, input_mat, b_xt, b_yt, b_yt_copy, + alpha, beta, nv, mode, max_error); + } } } } @@ -956,59 +917,6 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { output_y_copy, 1.0, 1.0, numMV, max_error); } -// check that the controls are flowing down correctly in the spmv kernel -template -void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, - const KokkosKernels::Experimental::Controls &controls = - KokkosKernels::Experimental::Controls()) { - using crsMat_t = typename KokkosSparse::CrsMatrix; - using scalar_view_t = typename crsMat_t::values_type::non_const_type; - using x_vector_type = scalar_view_t; - using y_vector_type = scalar_view_t; - using mag_t = typename Kokkos::ArithTraits::mag_type; - - constexpr mag_t max_x = static_cast(10); - constexpr mag_t max_y = static_cast(10); - constexpr mag_t max_val = static_cast(10); - - lno_t numCols = numRows; - - crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, row_size_variance, bandwidth); - lno_t nr = input_mat.numRows(); - lno_t nc = input_mat.numCols(); - - x_vector_type input_x("x", nc); - y_vector_type output_y("y", nr); - - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); - - Kokkos::fill_random(input_x, rand_pool, max_x); - Kokkos::fill_random(output_y, rand_pool, max_y); - Kokkos::fill_random(input_mat.values, rand_pool, max_val); - - const mag_t max_error = max_y + bandwidth * max_val * max_x; - - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0, - max_error); - Test::check_spmv_controls(controls, input_mat, input_x, 
output_y, 0.0, 1.0, - max_error); - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 1.0, - max_error); -} // test_spmv_controls - -// test the native algorithm -template -void test_spmv_native(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance) { - KokkosKernels::Experimental::Controls controls; - controls.setParameter("algorithm", "native"); - test_spmv_controls(numRows, nnz, bandwidth, row_size_variance, controls); -} // test_spmv_native - // call it if ordinal int and, scalar float and double are instantiated. template void test_github_issue_101() { @@ -1177,6 +1085,10 @@ void test_spmv_all_interfaces_light() { using vector_t = Kokkos::View; using range1D_t = Kokkos::RangePolicy; using range2D_t = Kokkos::MDRangePolicy>; + using v_handle_t = + KokkosSparse::SPMVHandle; + using mv_handle_t = KokkosSparse::SPMVHandle; multivector_t x_mv("x_mv", n, 3); vector_t x("x", n); // Randomize x (it won't be modified after that) @@ -1216,41 +1128,24 @@ void test_spmv_all_interfaces_light() { space_partitions = Kokkos::Experimental::partition_space(space, 1, 1); space = space_partitions[1]; } - KokkosKernels::Experimental::Controls controls; - // All tagged versions - KokkosSparse::spmv(space, controls, "N", 1.0, A, x, 0.0, y, - KokkosSparse::RANK_ONE()); - space.fence(); - verify(); - clear_y(); - KokkosSparse::spmv(controls, "N", 1.0, A, x, 0.0, y, - KokkosSparse::RANK_ONE()); - verify(); - clear_y(); - KokkosSparse::spmv(space, controls, "N", 1.0, A, x_mv, 0.0, y_mv, - KokkosSparse::RANK_TWO()); - space.fence(); - verify_mv(); - clear_y(); - KokkosSparse::spmv(controls, "N", 1.0, A, x_mv, 0.0, y_mv, - KokkosSparse::RANK_TWO()); - verify_mv(); - clear_y(); - // Non-tagged versions - // space and controls - spmv(space, controls, "N", 1.0, A, x, 0.0, y); + + v_handle_t v_handle; + mv_handle_t mv_handle; + + // space and handle + spmv(space, &v_handle, "N", 1.0, A, x, 0.0, y); space.fence(); verify(); clear_y(); - spmv(space, 
controls, "N", 1.0, A, x_mv, 0.0, y_mv); + spmv(space, &mv_handle, "N", 1.0, A, x_mv, 0.0, y_mv); space.fence(); verify_mv(); clear_y(); - // controls - spmv(controls, "N", 1.0, A, x, 0.0, y); + // handle + spmv(&v_handle, "N", 1.0, A, x, 0.0, y); verify(); clear_y(); - spmv(controls, "N", 1.0, A, x_mv, 0.0, y_mv); + spmv(&mv_handle, "N", 1.0, A, x_mv, 0.0, y_mv); verify_mv(); clear_y(); // space @@ -1291,8 +1186,6 @@ void test_spmv_all_interfaces_light() { 100, 10, false); \ test_spmv_algorithms(10000, 10000 * 2, \ 100, 5, false); \ - test_spmv_controls(10000, 10000 * 20, \ - 100, 5); \ } #define EXECUTE_TEST_INTERFACES(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ @@ -1308,19 +1201,30 @@ void test_spmv_all_interfaces_light() { TestCategory, \ sparse##_##spmv_mv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ test_spmv_mv( \ - 1000, 1000 * 3, 200, 10, true, 1); \ + 1001, 1001 * 3, 200, 10, true, 1); \ test_spmv_mv( \ - 1000, 1000 * 3, 100, 10, true, 5); \ + 999, 999 * 3, 100, 10, true, 5); \ test_spmv_mv( \ - 1000, 1000 * 2, 100, 5, true, 10); \ + 1003, 1003 * 2, 100, 5, true, 10); \ test_spmv_mv( \ - 50000, 50000 * 3, 20, 10, false, 1); \ + 50007, 50007 * 3, 20, 10, false, 1); \ test_spmv_mv( \ - 50000, 50000 * 3, 100, 10, false, 1); \ + 50002, 50002 * 3, 100, 10, false, 1); \ test_spmv_mv( \ 10000, 10000 * 2, 100, 5, false, 5); \ - test_spmv_mv_heavy( \ - 200, 200 * 10, 60, 4, 30); \ + test_spmv_mv_heavy(204, 201, 204 * 10, 60, 4, 30); \ + test_spmv_mv_heavy(2, 3, 5, 3, 1, 10); \ + } + +#define EXECUTE_TEST_MV_MIXED_LAYOUT(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##spmv_mv_mixed_layout##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + test_spmv_mv_heavy(99, 101, 100 * 15, 40, 4, \ + 20); \ } #define EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, DEVICE) \ @@ -1387,8 +1291,20 @@ EXECUTE_TEST_ISSUE_101(TestDevice) #include #undef KOKKOSKERNELS_EXECUTE_TEST +#endif + +// Test that requires mixing LayoutLeft 
and LayoutRight (never an ETI'd +// combination) +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV_MIXED_LAYOUT(SCALAR, ORDINAL, OFFSET, TestDevice) -#endif // defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST +#endif #undef EXECUTE_TEST_FN #undef EXECUTE_TEST_STRUCT diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index 5b823a22f7e5..6482d33d8a9a 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -40,7 +40,6 @@ #include #include #include -#include "KokkosKernels_Controls.hpp" #include "KokkosKernels_default_types.hpp" #include "KokkosSparse_spmv.hpp" @@ -53,29 +52,6 @@ using kokkos_complex_double = Kokkos::complex; using kokkos_complex_float = Kokkos::complex; -/* Poor-man's std::optional since CUDA 11.0 seems to have an ICE - https://github.com/kokkos/kokkos-kernels/issues/1943 -*/ -struct OptCtrls { - bool present_; - KokkosKernels::Experimental::Controls ctrls_; - - OptCtrls() : present_(false) {} - OptCtrls(const KokkosKernels::Experimental::Controls &ctrls) - : present_(true), ctrls_(ctrls) {} - - operator bool() const { return present_; } - - constexpr const KokkosKernels::Experimental::Controls &operator*() - const &noexcept { - return ctrls_; - } - constexpr const KokkosKernels::Experimental::Controls *operator->() const - noexcept { - return &ctrls_; - } -}; - namespace Test_Spmv_Bsr { /*! \brief Maximum value used to fill A */ @@ -171,10 +147,10 @@ Bsr bsr_random(const int blockSize, const int blockRows, const int blockCols) { /*! 
\brief test a specific spmv */ -template -void test_spmv(const OptCtrls &controls, const char *mode, const Alpha &alpha, +template +void test_spmv(Handle *handle, const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, const Crs &acrs, size_t maxNnzPerRow, const XVector &x, const YVector &y) { using scalar_type = typename Bsr::non_const_value_type; @@ -191,11 +167,7 @@ void test_spmv(const OptCtrls &controls, const char *mode, const Alpha &alpha, YVector yAct("yAct", y.extent(0)); Kokkos::deep_copy(yAct, y); - if (controls) { - KokkosSparse::spmv(*controls, mode, alpha, a, x, beta, yAct); - } else { - KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); - } + KokkosSparse::spmv(handle, mode, alpha, a, x, beta, yAct); // compare yExp and yAct auto hyExp = Kokkos::create_mirror_view(yExp); @@ -223,12 +195,8 @@ void test_spmv(const OptCtrls &controls, const char *mode, const Alpha &alpha, } if (!errIdx.empty()) { - std::string alg; - if (controls) { - alg = controls->getParameter("algorithm", ""); - } else { - alg = ""; - } + std::string alg = + KokkosSparse::get_spmv_algorithm_name(handle->get_algorithm()); std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMV failure!" 
<< std::endl; @@ -384,38 +352,43 @@ auto random_vecs_for_spmv(const char *mode, const Bsr &a) { template void test_spmv_combos(const char *mode, const Bsr &a, const Crs &acrs, size_t maxNnzPerRow) { + using namespace KokkosSparse; using scalar_type = typename Bsr::non_const_value_type; using execution_space = typename Bsr::execution_space; auto [x, y] = random_vecs_for_spmv(mode, a); - // cover a variety of controls - using Ctrls = KokkosKernels::Experimental::Controls; - std::vector ctrls = {OptCtrls(), // no controls - OptCtrls(Ctrls()), // empty controls - OptCtrls(Ctrls({{"algorithm", "tpl"}})), - OptCtrls(Ctrls({{"algorithm", "v4.1"}}))}; + using handle_t = SPMVHandle; + // cover a variety of algorithms + std::vector handles; + for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) + handles.push_back(new handle_t(algo)); + + // Tensor core algorithm temporarily disabled, fails on V100 + /* if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) - ctrls.push_back(OptCtrls(Ctrls({{"algorithm", "experimental_tc"}}))); + handles.push_back(new handle_t(SPMV_BSR_TC)); #if defined(KOKKOS_ARCH_AMPERE) - ctrls.push_back(OptCtrls(Ctrls( - {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}}))); + // Also call SPMV_BSR_TC with Precision = Double on Ampere + handles.push_back(new handle_t(SPMV_BSR_TC)); + handles.back()->bsr_tc_precision = Experimental::Bsr_TC_Precision::Double; #endif // AMPERE #endif // AMPERE || VOLTA } #endif // CUDA } + */ - for (const auto &ctrl : ctrls) { + for (handle_t *handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(-1.5)}) { - test_spmv(ctrl, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spmv(handle, mode, alpha, beta, a, acrs, 
maxNnzPerRow, x, y); } } } @@ -499,9 +472,9 @@ void test_spmv() { // Note: if mode_is_transpose(mode), then maxNnzPerRow is for A^T. Otherwise, // it's for A. -template -void test_spm_mv(const OptCtrls &controls, const char *mode, const Alpha &alpha, +template +void test_spm_mv(Handle *handle, const char *mode, const Alpha &alpha, const Beta &beta, const Bsr &a, const Crs &acrs, size_t maxNnzPerRow, const XVector &x, const YVector &y) { using scalar_type = typename Bsr::non_const_value_type; @@ -518,11 +491,7 @@ void test_spm_mv(const OptCtrls &controls, const char *mode, const Alpha &alpha, YVector yAct("yAct", y.extent(0), y.extent(1)); Kokkos::deep_copy(yAct, y); - if (controls) { - KokkosSparse::spmv(*controls, mode, alpha, a, x, beta, yAct); - } else { - KokkosSparse::spmv(mode, alpha, a, x, beta, yAct); - } + KokkosSparse::spmv(handle, mode, alpha, a, x, beta, yAct); // compare yExp and yAct auto hyExp = Kokkos::create_mirror_view(yExp); @@ -550,12 +519,8 @@ void test_spm_mv(const OptCtrls &controls, const char *mode, const Alpha &alpha, } if (!errIdx.empty()) { - std::string alg; - if (controls) { - alg = controls->getParameter("algorithm", ""); - } else { - alg = ""; - } + std::string alg = + KokkosSparse::get_spmv_algorithm_name(handle->get_algorithm()); std::cerr << __FILE__ << ":" << __LINE__ << " BsrMatrix SpMMV failure!" 
<< std::endl; @@ -621,38 +586,44 @@ auto random_multivecs_for_spm_mv(const char *mode, const Bsr &a, template void test_spm_mv_combos(const char *mode, const Bsr &a, const Crs &acrs, size_t maxNnzPerRow) { + using namespace KokkosSparse; using execution_space = typename Bsr::execution_space; using scalar_type = typename Bsr::non_const_value_type; + using multivector_t = typename MultiVectorTypeFor::type; + using handle_t = + SPMVHandle; - // cover a variety of controls - using Ctrls = KokkosKernels::Experimental::Controls; - std::vector ctrls = {OptCtrls(), // no controls - OptCtrls(Ctrls()), // empty controls - OptCtrls(Ctrls({{"algorithm", "tpl"}})), - OptCtrls(Ctrls({{"algorithm", "v4.1"}}))}; + // cover a variety of algorithms + std::vector handles; + for (SPMVAlgorithm algo : {SPMV_DEFAULT, SPMV_NATIVE, SPMV_BSR_V41}) + handles.push_back(new handle_t(algo)); + // Tensor core algorithm temporarily disabled, fails on V100 + /* if constexpr (KokkosKernels::Impl::kk_is_gpu_exec_space()) { #if defined(KOKKOS_ENABLE_CUDA) if constexpr (std::is_same_v) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) - ctrls.push_back(OptCtrls(Ctrls({{"algorithm", "experimental_tc"}}))); + handles.push_back(new handle_t(SPMV_BSR_TC)); #if defined(KOKKOS_ARCH_AMPERE) - ctrls.push_back(OptCtrls(Ctrls( - {{"algorithm", "experimental_tc"}, {"tc_precision", "double"}}))); + // Also call SPMV_BSR_TC with Precision = Double on Ampere + handles.push_back(new handle_t(SPMV_BSR_TC)); + handles.back()->bsr_tc_precision = Experimental::Bsr_TC_Precision::Double; #endif // AMPERE #endif // AMPERE || VOLTA } #endif // CUDA } + */ for (size_t numVecs : {1, 7}) { // num multivecs auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); - for (const auto &ctrl : ctrls) { + for (handle_t *handle : handles) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), 
scalar_type(-1.5)}) { - test_spm_mv(ctrl, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); + test_spm_mv(handle, mode, alpha, beta, a, acrs, maxNnzPerRow, x, y); } } } diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_sptrsv.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_sptrsv.hpp index 1a4c78e08e02..b8b35bc422be 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_sptrsv.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_sptrsv.hpp @@ -38,1320 +38,808 @@ using namespace KokkosKernels; using namespace KokkosKernels::Impl; using namespace KokkosKernels::Experimental; -// #ifndef kokkos_complex_double -// #define kokkos_complex_double Kokkos::complex -// #endif -// #ifndef kokkos_complex_float -// #define kokkos_complex_float Kokkos::complex -// #endif - -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; +using kokkos_complex_double = Kokkos::complex; +using kokkos_complex_float = Kokkos::complex; namespace Test { -#if 0 -template -void run_test_sptrsv_mtx() { - - typedef typename KokkosSparse::CrsMatrix crsmat_t; - typedef typename crsmat_t::StaticCrsGraphType graph_t; - - //typedef Kokkos::View< size_type*, device > RowMapType; - //typedef Kokkos::View< lno_t*, device > EntriesType; - typedef Kokkos::View< scalar_t*, device > ValuesType; - - // Lower tri - std::cout << "LowerTriTest Begin" << std::endl; - { - -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-offshore-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-Transport-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-Fault_639amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-thermal2-amd.mtx"; - 
std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/L-dielFilterV2real-amd.mtx"; - std::cout << "Matrix file: " << mtx_filename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix(mtx_filename.c_str()); //in_matrix - graph_t lgraph = triMtx.graph; // in_graph - - auto row_map = lgraph.row_map; - auto entries = lgraph.entries; - auto values = triMtx.values; - - const size_type nrows = lgraph.numRows(); -// const size_type nnz = triMtx.nnz(); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; - - std::cout << "UnitTest nrows = " << nrows << std::endl; - - KernelHandle kh; - bool is_lower_tri = true; - std::cout << "Create handle" << std::endl; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - - std::cout << "Prepare linear system" << std::endl; - // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - -// typedef CrsMatrix crsMat_t; -// crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - - std::cout << "SPMV" << std::endl; - KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); - - std::cout << "TriSolve Symbolic" << std::endl; - Kokkos::Timer timer; - sptrsv_symbolic( &kh, row_map, entries ); - std::cout << "LTRI Symbolic Time: " << timer.seconds() << std::endl; +template +struct SptrsvTest { + // Define useful types + using RowMapType = Kokkos::View; + using EntriesType = Kokkos::View; + using ValuesType = Kokkos::View; + using RowMapType_hostmirror = typename RowMapType::HostMirror; + using EntriesType_hostmirror = typename 
EntriesType::HostMirror; + using ValuesType_hostmirror = typename ValuesType::HostMirror; + using execution_space = typename device::execution_space; + using memory_space = typename device::memory_space; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; - std::cout << "TriSolve Solve" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "LTRI Solve TEAMPOLICY! Time: " << timer.seconds() << std::endl; + using Crs = CrsMatrix; + using Bsr = BsrMatrix; - scalar_t sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "LTRI Solve SEQLVLSCHD_RP Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" 
<< std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "LTRI Solve SEQLVLSCHED_TP2 Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + using crs_graph_t = typename Crs::StaticCrsGraphType; + using range_policy_t = Kokkos::RangePolicy; - kh.destroy_sptrsv_handle(); + static std::vector> get_5x5_ut_ones_fixture() { + std::vector> A = {{1.00, 0.00, 1.00, 0.00, 0.00}, + {0.00, 1.00, 0.00, 0.00, 1.00}, + {0.00, 0.00, 1.00, 1.00, 1.00}, + {0.00, 0.00, 0.00, 1.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 1.00}}; + return A; } - // Upper tri - std::cout << "UpperTriTest Begin" << std::endl; - { -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-offshore-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-Transport-amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-Fault_639amd.mtx"; -// std::string mtx_filename = "/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-thermal2-amd.mtx"; - std::string mtx_filename = 
"/ascldap/users/ndellin/TestCodes-GitlabEx/KokkosEcoCodes/KokkosKernels-DevTests/Matrices/U-dielFilterV2real-amd.mtx"; - std::cout << "Matrix file: " << mtx_filename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix(mtx_filename.c_str()); //in_matrix - graph_t lgraph = triMtx.graph; // in_graph - - auto row_map = lgraph.row_map; - auto entries = lgraph.entries; - auto values = triMtx.values; - - const size_type nrows = lgraph.numRows(); -// const size_type nnz = triMtx.nnz(); - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; - - std::cout << "UnitTest nrows = " << nrows << std::endl; - - KernelHandle kh; - bool is_lower_tri = false; - std::cout << "Create handle" << std::endl; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - - std::cout << "Prepare linear system" << std::endl; - // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - ValuesType lhs("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); - -// typedef CrsMatrix crsMat_t; -// crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - std::cout << "SPMV" << std::endl; - KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); - - std::cout << "TriSolve Symbolic" << std::endl; - Kokkos::Timer timer; - sptrsv_symbolic( &kh, row_map, entries ); - std::cout << "UTRI Symbolic Time: " << timer.seconds() << std::endl; - - std::cout << "TriSolve Solve" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "UTRI Solve SEQLVLSCHD_TP1 Time: " << timer.seconds() << std::endl; - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( 
Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "UTRI Solve SEQLVLSCHD_RP Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - kh.get_sptrsv_handle()->print_algorithm(); - timer.reset(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - std::cout << "UTRI Solve SEQLVLSCHED_TP2 Time: " << timer.seconds() << std::endl; - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); - }, sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" 
<< std::endl; - //std::cout << "Num-levels = " << kh->get_sptrsv_handle()->get_num_levels() << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - kh.destroy_sptrsv_handle(); + static std::vector> get_5x5_ut_fixture() { + const auto KZ = KEEP_ZERO(); + std::vector> A = {{5.00, 1.00, 1.00, 0.00, KZ}, + {KZ, 5.00, KZ, 0.00, 1.00}, + {0.00, 0.00, 5.00, 1.00, 1.00}, + {0.00, 0.00, 0.00, 5.00, 1.00}, + {0.00, 0.00, 0.00, 0.00, 5.00}}; + return A; } -} -#endif - -namespace { -template -struct ReductionCheck { - using lno_t = OrdinalType; - using value_type = ValueType; - - ViewType lhs; + static std::vector> get_5x5_lt_fixture() { + const auto KZ = KEEP_ZERO(); + std::vector> A = {{5.00, KZ, 0.00, 0.00, 0.00}, + {2.00, 5.00, 0.00, 0.00, 0.00}, + {1.00, KZ, 5.00, 0.00, 0.00}, + {0.00, 0.00, 1.00, 5.00, 0.00}, + {KZ, 1.00, 1.00, 1.00, 5.00}}; + return A; + } - ReductionCheck(const ViewType &lhs_) : lhs(lhs_) {} + static std::vector> get_5x5_lt_ones_fixture() { + std::vector> A = {{1.00, 0.00, 0.00, 0.00, 0.00}, + {0.00, 1.00, 0.00, 0.00, 0.00}, + {1.00, 0.00, 1.00, 0.00, 0.00}, + {0.00, 0.00, 1.00, 1.00, 0.00}, + {0.00, 1.00, 1.00, 1.00, 1.00}}; + return A; + } - KOKKOS_INLINE_FUNCTION - void operator()(lno_t i, value_type &tsum) const { tsum += lhs(i); } -}; -} // namespace + struct ReductionCheck { + ValuesType lhs; -template -void run_test_sptrsv() { - typedef Kokkos::View RowMapType; - typedef Kokkos::View EntriesType; - typedef Kokkos::View ValuesType; + ReductionCheck(const ValuesType &lhs_) : lhs(lhs_) {} - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); + KOKKOS_INLINE_FUNCTION + void operator()(lno_t i, scalar_t &tsum) const { tsum += lhs(i); } + }; - const size_type nrows = 5; - const size_type nnz = 10; + static void run_test_sptrsv() { + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - 
typename device::memory_space, typename device::memory_space>; + const size_type nrows = 5; + const size_type nnz = 10; #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - using host_crsmat_t = typename KernelHandle::SPTRSVHandleType::host_crsmat_t; - using host_graph_t = typename host_crsmat_t::StaticCrsGraphType; + using host_crsmat_t = + typename KernelHandle::SPTRSVHandleType::host_crsmat_t; + using host_graph_t = typename host_crsmat_t::StaticCrsGraphType; - using row_map_view_t = typename host_graph_t::row_map_type::non_const_type; - using cols_view_t = typename host_graph_t::entries_type::non_const_type; - using values_view_t = typename host_crsmat_t::values_type::non_const_type; + using row_map_view_t = typename host_graph_t::row_map_type::non_const_type; + using cols_view_t = typename host_graph_t::entries_type::non_const_type; + using values_view_t = typename host_crsmat_t::values_type::non_const_type; - // L & U handle for supernodal SpTrsv - KernelHandle khL; - KernelHandle khU; + // L & U handle for supernodal SpTrsv + KernelHandle khL; + KernelHandle khU; - // right-hand-side and solution - ValuesType B("rhs", nrows); - ValuesType X("sol", nrows); + // right-hand-side and solution + ValuesType B("rhs", nrows); + ValuesType X("sol", nrows); - // host CRS for L & U - host_crsmat_t L, U, Ut; + // host CRS for L & U + host_crsmat_t L, U, Ut; #endif - // Upper tri - { - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - - hrow_map(0) = 0; - hrow_map(1) = 2; - hrow_map(2) = 4; - hrow_map(3) = 7; - hrow_map(4) = 9; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 1; - hentries(3) = 4; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 4; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 4; - - 
for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } - - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); + // Upper tri + { + RowMapType row_map; + EntriesType entries; + ValuesType values; - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); + auto fixture = get_5x5_ut_ones_fixture(); - // Solution to find - ValuesType lhs("lhs", nrows); + compress_matrix(row_map, entries, values, fixture); - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); - typedef CrsMatrix crsMat_t; - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + // Solution to find + ValuesType lhs("lhs", nrows); - { - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - 
ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); if ( sum != lhs.extent(0) ) { std::cout << - "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - */ + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); - kh.destroy_sptrsv_handle(); - } + Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + { + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + 
Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + // FIXME Issues with various integral type combos - algorithm currently + // unavailable and commented out until fixed + /* + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0) ); + */ + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - kh.destroy_sptrsv_handle(); - } + { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, + is_lower_tri); + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + kh.destroy_sptrsv_handle(); + } #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - 
- sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + if (std::is_same::value && + std::is_same::value && + std::is_same::value) { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } #endif #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - const scalar_t FIVE = scalar_t(5); - const size_type nnz_sp = 14; - { - // U in csr - row_map_view_t hUrowptr("hUrowptr", nrows + 1); - cols_view_t hUcolind("hUcolind", nnz_sp); - values_view_t hUvalues("hUvalues", nnz_sp); - - // rowptr - hUrowptr(0) = 0; - hUrowptr(1) = 4; - hUrowptr(2) = 8; - hUrowptr(3) = 11; - hUrowptr(4) = 13; - hUrowptr(5) = 14; - - // colind - // first row (first supernode) - hUcolind(0) = 0; - hUcolind(1) = 1; - hUcolind(2) = 2; - hUcolind(3) = 4; - // second row (first supernode) - hUcolind(4) = 0; - hUcolind(5) = 1; - hUcolind(6) = 2; - hUcolind(7) = 4; - // third row (second supernode) - hUcolind(8) = 2; - hUcolind(9) = 3; - hUcolind(10) = 4; - // fourth row (third supernode) - hUcolind(11) = 3; - hUcolind(12) = 4; - // fifth row (fourth supernode) - hUcolind(13) = 4; - - // values - // first row (first supernode) - hUvalues(0) = FIVE; - hUvalues(1) = ONE; - hUvalues(2) = ONE; - hUvalues(3) = ZERO; - // second 
row (first supernode) - hUvalues(4) = ZERO; - hUvalues(5) = FIVE; - hUvalues(6) = ZERO; - hUvalues(7) = ONE; - // third row (second supernode) - hUvalues(8) = FIVE; - hUvalues(9) = ONE; - hUvalues(10) = ONE; - // fourth row (third supernode) - hUvalues(11) = FIVE; - hUvalues(12) = ONE; - // fifth row (fourth supernode) - hUvalues(13) = FIVE; - - // save U for Supernodal Sptrsv - host_graph_t static_graph(hUcolind, hUrowptr); - U = host_crsmat_t("CrsMatrixU", nrows, hUvalues, static_graph); - - // create handle for Supernodal Sptrsv - bool is_lower_tri = false; - khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - is_lower_tri); - - // X = U*ONES to generate B = A*ONES (on device) + const scalar_t FIVE = scalar_t(5); + const size_type nnz_sp = 14; { - RowMapType Urowptr("Urowptr", nrows + 1); - EntriesType Ucolind("Ucolind", nnz_sp); - ValuesType Uvalues("Uvalues", nnz_sp); - - Kokkos::deep_copy(Urowptr, hUrowptr); - Kokkos::deep_copy(Ucolind, hUcolind); - Kokkos::deep_copy(Uvalues, hUvalues); + // U in csr + auto ut_fixture = get_5x5_ut_fixture(); + row_map_view_t hUrowptr; + cols_view_t hUcolind; + values_view_t hUvalues; + + // first row -> first supernode + // second row -> first supernode + // third row -> second supernode + // fourth row -> third supernode + // fifth row -> fourth supernode + + compress_matrix(hUrowptr, hUcolind, hUvalues, ut_fixture); + + // save U for Supernodal Sptrsv + host_graph_t static_graph(hUcolind, hUrowptr); + U = host_crsmat_t("CrsMatrixU", nrows, hUvalues, static_graph); + + // create handle for Supernodal Sptrsv + bool is_lower_tri = false; + khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + is_lower_tri); + + // X = U*ONES to generate B = A*ONES (on device) + { + RowMapType Urowptr("Urowptr", nrows + 1); + EntriesType Ucolind("Ucolind", nnz_sp); + ValuesType Uvalues("Uvalues", nnz_sp); + + Kokkos::deep_copy(Urowptr, hUrowptr); + Kokkos::deep_copy(Ucolind, hUcolind); + Kokkos::deep_copy(Uvalues, 
hUvalues); + + Crs mtxU("mtxU", nrows, nrows, nnz_sp, Uvalues, Urowptr, Ucolind); + Kokkos::deep_copy(B, ONE); + KokkosSparse::spmv("N", ONE, mtxU, B, ZERO, X); + } + } - crsMat_t mtxU("mtxU", nrows, nrows, nnz_sp, Uvalues, Urowptr, Ucolind); - Kokkos::deep_copy(B, ONE); - KokkosSparse::spmv("N", ONE, mtxU, B, ZERO, X); + { + // U in csc (for inverting off-diag) + row_map_view_t hUcolptr("hUcolptr", nrows + 1); + cols_view_t hUrowind("hUrowind", nnz_sp); + values_view_t hUvalues("hUvalues", nnz_sp); + + // The unsorted ordering seems to matter here, so we cannot use our + // fixture tools. + + hUcolptr(0) = 0; + hUcolptr(1) = 2; + hUcolptr(2) = 4; + hUcolptr(3) = 7; + hUcolptr(4) = 9; + hUcolptr(5) = 14; + + // colind + // first column (first supernode) + hUrowind(0) = 0; + hUrowind(1) = 1; + // second column (first supernode) + hUrowind(2) = 0; + hUrowind(3) = 1; + // third column (second supernode) + hUrowind(4) = 2; + hUrowind(5) = 0; + hUrowind(6) = 1; + // fourth column (third supernode) + hUrowind(7) = 3; + hUrowind(8) = 2; + // fifth column (fourth supernode) + hUrowind(9) = 4; + hUrowind(10) = 0; + hUrowind(11) = 1; + hUrowind(12) = 2; + hUrowind(13) = 3; + + // values + // first column (first supernode) + hUvalues(0) = FIVE; + hUvalues(1) = ZERO; + // second column (first supernode) + hUvalues(2) = ONE; + hUvalues(3) = FIVE; + // third column (second supernode) + hUvalues(4) = FIVE; + hUvalues(5) = ONE; + hUvalues(6) = ZERO; + // fourth column (third supernode) + hUvalues(7) = FIVE; + hUvalues(8) = ONE; + // fifth column (fourth supernode) + hUvalues(9) = FIVE; + hUvalues(10) = ZERO; + hUvalues(11) = ONE; + hUvalues(12) = ONE; + hUvalues(13) = ONE; + + // store Ut in crsmat + host_graph_t static_graph(hUrowind, hUcolptr); + Ut = host_crsmat_t("CrsMatrixUt", nrows, hUvalues, static_graph); } +#endif } + // Lower tri { - // U in csc (for inverting off-diag) - row_map_view_t hUcolptr("hUcolptr", nrows + 1); - cols_view_t hUrowind("hUrowind", nnz_sp); - 
values_view_t hUvalues("hUvalues", nnz_sp); - - // colptr - hUcolptr(0) = 0; - hUcolptr(1) = 2; - hUcolptr(2) = 4; - hUcolptr(3) = 7; - hUcolptr(4) = 9; - hUcolptr(5) = 14; - - // colind - // first column (first supernode) - hUrowind(0) = 0; - hUrowind(1) = 1; - // second column (first supernode) - hUrowind(2) = 0; - hUrowind(3) = 1; - // third column (second supernode) - hUrowind(4) = 2; - hUrowind(5) = 0; - hUrowind(6) = 1; - // fourth column (third supernode) - hUrowind(7) = 3; - hUrowind(8) = 2; - // fifth column (fourth supernode) - hUrowind(9) = 4; - hUrowind(10) = 0; - hUrowind(11) = 1; - hUrowind(12) = 2; - hUrowind(13) = 3; - - // values - // first column (first supernode) - hUvalues(0) = FIVE; - hUvalues(1) = ZERO; - // second column (first supernode) - hUvalues(2) = ONE; - hUvalues(3) = FIVE; - // third column (second supernode) - hUvalues(4) = FIVE; - hUvalues(5) = ONE; - hUvalues(6) = ZERO; - // fourth column (third supernode) - hUvalues(7) = FIVE; - hUvalues(8) = ONE; - // fifth column (fourth supernode) - hUvalues(9) = FIVE; - hUvalues(10) = ZERO; - hUvalues(11) = ONE; - hUvalues(12) = ONE; - hUvalues(13) = ONE; - - // store Ut in crsmat - host_graph_t static_graph(hUrowind, hUcolptr); - Ut = host_crsmat_t("CrsMatrixUt", nrows, hUvalues, static_graph); - } -#endif - } - - // Lower tri - { - RowMapType row_map("row_map", nrows + 1); - EntriesType entries("entries", nnz); - ValuesType values("values", nnz); - - auto hrow_map = Kokkos::create_mirror_view(row_map); - auto hentries = Kokkos::create_mirror_view(entries); - auto hvalues = Kokkos::create_mirror_view(values); - - hrow_map(0) = 0; - hrow_map(1) = 1; - hrow_map(2) = 2; - hrow_map(3) = 4; - hrow_map(4) = 6; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 1; - hentries(2) = 0; - hentries(3) = 2; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 1; - hentries(7) = 2; - hentries(8) = 3; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } + auto fixture 
= get_5x5_lt_ones_fixture(); + RowMapType row_map; + EntriesType entries; + ValuesType values; - Kokkos::deep_copy(row_map, hrow_map); - Kokkos::deep_copy(entries, hentries); - Kokkos::deep_copy(values, hvalues); + compress_matrix(row_map, entries, values, fixture); - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); - // Solution to find - ValuesType lhs("lhs", nrows); + // Solution to find + ValuesType lhs("lhs", nrows); - // A*known_lhs generates rhs: rhs is dense, use spmv - ValuesType rhs("rhs", nrows); + // A*known_lhs generates rhs: rhs is dense, use spmv + ValuesType rhs("rhs", nrows); - typedef CrsMatrix crsMat_t; - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); + Crs triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs); - { - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - sptrsv_solve(&kh, row_map, entries, 
values, rhs, lhs); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - // FIXME Issues with various integral type combos - algorithm currently - // unavailable and commented out until fixed - /* - Kokkos::deep_copy(lhs, ZERO); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); if ( sum != lhs.extent(0) ) { std::cout << - "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + { + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + // FIXME Issues with various integral type combos - algorithm currently + // unavailable and commented out until fixed + /* + Kokkos::deep_copy(lhs, ZERO); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( range_policy_t(0, 
lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ( sum, lhs.extent(0) ); + */ + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - */ - kh.destroy_sptrsv_handle(); - } - - { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, - is_lower_tri); - auto chain_threshold = 1; - kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); - - sptrsv_symbolic(&kh, row_map, entries); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, + is_lower_tri); + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + + sptrsv_symbolic(&kh, row_map, entries); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - if (std::is_same::value && - std::is_same::value && - std::is_same::value) { - Kokkos::deep_copy(lhs, ZERO); - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - sptrsv_symbolic(&kh, row_map, entries, values); - Kokkos::fence(); - - sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); - Kokkos::fence(); - - scalar_t sum = 0.0; - 
Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, - lhs.extent(0)), - ReductionCheck(lhs), sum); - if (sum != lhs.extent(0)) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - kh.get_sptrsv_handle()->print_algorithm(); + if (std::is_same::value && + std::is_same::value && + std::is_same::value) { + Kokkos::deep_copy(lhs, ZERO); + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve(&kh, row_map, entries, values, rhs, lhs); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs.extent(0)), + ReductionCheck(lhs), sum); + EXPECT_EQ(sum, lhs.extent(0)); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs.extent(0))); - - kh.destroy_sptrsv_handle(); - } #endif #if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) - { - // L in csc - const scalar_t TWO = scalar_t(2); - const scalar_t FIVE = scalar_t(5); - const size_type nnz_sp = 14; - - row_map_view_t hLcolptr("hUcolptr", nrows + 1); - cols_view_t hLrowind("hUrowind", nnz_sp); - values_view_t hLvalues("hUvalues", nnz_sp); - - // colptr - hLcolptr(0) = 0; - hLcolptr(1) = 4; - hLcolptr(2) = 8; - hLcolptr(3) = 11; - hLcolptr(4) = 13; - hLcolptr(5) = 14; - - // rowind - // first column (first supernode) - hLrowind(0) = 0; - hLrowind(1) = 1; - hLrowind(2) = 2; - hLrowind(3) = 4; - // second column (first supernode) - hLrowind(4) = 0; - hLrowind(5) = 1; - hLrowind(6) = 2; - hLrowind(7) = 4; - // third column (second supernode) - hLrowind(8) = 2; - hLrowind(9) = 3; - hLrowind(10) = 4; - // fourth column (third supernode) - hLrowind(11) = 3; - hLrowind(12) = 4; - // fifth column (fourth supernode) - hLrowind(13) = 4; - - // values - // first column (first supernode) - hLvalues(0) = FIVE; - hLvalues(1) = TWO; - hLvalues(2) = ONE; - hLvalues(3) = ZERO; - // second column (first supernode) - hLvalues(4) = ZERO; - 
hLvalues(5) = FIVE; - hLvalues(6) = ZERO; - hLvalues(7) = ONE; - // third column (second supernode) - hLvalues(8) = FIVE; - hLvalues(9) = ONE; - hLvalues(10) = ONE; - // fourth column (third supernode) - hLvalues(11) = FIVE; - hLvalues(12) = ONE; - // fifth column (fourth supernode) - hLvalues(13) = FIVE; - - // store Lt in crsmat - host_graph_t static_graph(hLrowind, hLcolptr); - L = host_crsmat_t("CrsMatrixL", nrows, hLvalues, static_graph); - - bool is_lower_tri = true; - khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - is_lower_tri); - - // generate B = A*ONES = L*(U*ONES), where X = U*ONES (on device) { - RowMapType Lcolptr("Lcolptr", nrows + 1); - EntriesType Lrowind("Lrowind", nnz_sp); - ValuesType Lvalues("Lvalues", nnz_sp); - - Kokkos::deep_copy(Lcolptr, hLcolptr); - Kokkos::deep_copy(Lrowind, hLrowind); - Kokkos::deep_copy(Lvalues, hLvalues); - - crsMat_t mtxL("mtxL", nrows, nrows, nnz_sp, Lvalues, Lcolptr, Lrowind); - KokkosSparse::spmv("T", ONE, mtxL, X, ZERO, B); + // L in csc + const size_type nnz_sp = 14; + + // first column (first supernode) + // second column (first supernode) + // third column (second supernode) + // fourth column (third supernode) + // fifth column (fourth supernode) + + auto lt_fixture = get_5x5_lt_fixture(); + row_map_view_t hLcolptr; + cols_view_t hLrowind; + values_view_t hLvalues; + compress_matrix(hLcolptr, hLrowind, hLvalues, lt_fixture); + + // store Lt in crsmat + host_graph_t static_graph(hLrowind, hLcolptr); + L = host_crsmat_t("CrsMatrixL", nrows, hLvalues, static_graph); + + bool is_lower_tri = true; + khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + is_lower_tri); + + // generate B = A*ONES = L*(U*ONES), where X = U*ONES (on device) + { + RowMapType Lcolptr("Lcolptr", nrows + 1); + EntriesType Lrowind("Lrowind", nnz_sp); + ValuesType Lvalues("Lvalues", nnz_sp); + + Kokkos::deep_copy(Lcolptr, hLcolptr); + Kokkos::deep_copy(Lrowind, hLrowind); + Kokkos::deep_copy(Lvalues, hLvalues); 
+ + Crs mtxL("mtxL", nrows, nrows, nnz_sp, Lvalues, Lcolptr, Lrowind); + KokkosSparse::spmv("T", ONE, mtxL, X, ZERO, B); + } } - } - { - // unit-test for supernode SpTrsv (default) - // > set up supernodes (block size = one) - size_type nsupers = 4; - Kokkos::View supercols("supercols", - 1 + nsupers); - supercols(0) = 0; - supercols(1) = 2; // two columns - supercols(2) = 3; // one column - supercols(3) = 4; // one column - supercols(4) = 5; // one column - int *etree = NULL; // we generate graph internally - - // invert diagonal blocks - bool invert_diag = true; - khL.set_sptrsv_invert_diagonal(invert_diag); - khU.set_sptrsv_invert_diagonal(invert_diag); - - // > symbolic (on host) - sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, - &khL, U.graph, &khU); - // > numeric (on host) - sptrsv_compute(&khL, L); - sptrsv_compute(&khU, U); - Kokkos::fence(); - - // > solve - ValuesType b("b", nrows); - Kokkos::deep_copy(b, B); - Kokkos::deep_copy(X, ZERO); - sptrsv_solve(&khL, &khU, X, b); - Kokkos::fence(); - - // > check - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, X.extent(0)), - ReductionCheck(X), sum); - if (sum != lhs.extent(0)) { - std::cout << "Supernode Tri Solve FAILURE : " << sum << " vs." 
- << lhs.extent(0) << std::endl; - khL.get_sptrsv_handle()->print_algorithm(); - } else { - std::cout << "Supernode Tri Solve SUCCESS" << std::endl; - khL.get_sptrsv_handle()->print_algorithm(); + { + // unit-test for supernode SpTrsv (default) + // > set up supernodes (block size = one) + size_type nsupers = 4; + Kokkos::View supercols("supercols", + 1 + nsupers); + supercols(0) = 0; + supercols(1) = 2; // two columns + supercols(2) = 3; // one column + supercols(3) = 4; // one column + supercols(4) = 5; // one column + int *etree = NULL; // we generate graph internally + + // invert diagonal blocks + bool invert_diag = true; + khL.set_sptrsv_invert_diagonal(invert_diag); + khU.set_sptrsv_invert_diagonal(invert_diag); + + // > symbolic (on host) + sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, + &khL, U.graph, &khU); + // > numeric (on host) + sptrsv_compute(&khL, L); + sptrsv_compute(&khU, U); + Kokkos::fence(); + + // > solve + ValuesType b("b", nrows); + Kokkos::deep_copy(b, B); + Kokkos::deep_copy(X, ZERO); + sptrsv_solve(&khL, &khU, X, b); + Kokkos::fence(); + + // > check + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)), + ReductionCheck(X), sum); + EXPECT_EQ(sum, lhs.extent(0)); + EXPECT_EQ(sum, X.extent(0)); + + khL.destroy_sptrsv_handle(); + khU.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(X.extent(0))); - khL.destroy_sptrsv_handle(); - khU.destroy_sptrsv_handle(); - } - - { - // unit-test for supernode SpTrsv (running TRMM on device for compute) - // > set up supernodes - size_type nsupers = 4; - Kokkos::View supercols("supercols", - 1 + nsupers); - supercols(0) = 0; - supercols(1) = 2; // two columns - supercols(2) = 3; // one column - supercols(3) = 4; // one column - supercols(4) = 5; // one column - int *etree = NULL; // we generate tree internally - - // > create handles - KernelHandle khLd; - KernelHandle khUd; - khLd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, true); 
- khUd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, false); - - // > invert diagonal blocks - bool invert_diag = true; - khLd.set_sptrsv_invert_diagonal(invert_diag); - khUd.set_sptrsv_invert_diagonal(invert_diag); - - // > invert off-diagonal blocks - bool invert_offdiag = true; - khUd.set_sptrsv_column_major(true); - khLd.set_sptrsv_invert_offdiagonal(invert_offdiag); - khUd.set_sptrsv_invert_offdiagonal(invert_offdiag); - - // > forcing sptrsv compute to perform TRMM on device - khLd.set_sptrsv_diag_supernode_sizes(1, 1); - khUd.set_sptrsv_diag_supernode_sizes(1, 1); - - // > symbolic (on host) - sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, - &khLd, Ut.graph, &khUd); - // > numeric (on host) - sptrsv_compute(&khLd, L); - sptrsv_compute(&khUd, Ut); - Kokkos::fence(); - - // > solve - ValuesType b("b", nrows); - Kokkos::deep_copy(b, B); - Kokkos::deep_copy(X, ZERO); - sptrsv_solve(&khLd, &khUd, X, b); - Kokkos::fence(); - - // > check - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, X.extent(0)), - ReductionCheck(X), sum); - if (sum != lhs.extent(0)) { - std::cout << "Supernode Tri Solve FAILURE : " << sum << " vs." 
- << lhs.extent(0) << std::endl; - khLd.get_sptrsv_handle()->print_algorithm(); - } else { - std::cout << "Supernode Tri Solve SUCCESS" << std::endl; - khLd.get_sptrsv_handle()->print_algorithm(); + { + // unit-test for supernode SpTrsv (running TRMM on device for compute) + // > set up supernodes + size_type nsupers = 4; + Kokkos::View supercols("supercols", + 1 + nsupers); + supercols(0) = 0; + supercols(1) = 2; // two columns + supercols(2) = 3; // one column + supercols(3) = 4; // one column + supercols(4) = 5; // one column + int *etree = NULL; // we generate tree internally + + // > create handles + KernelHandle khLd; + KernelHandle khUd; + khLd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, true); + khUd.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + false); + + // > invert diagonal blocks + bool invert_diag = true; + khLd.set_sptrsv_invert_diagonal(invert_diag); + khUd.set_sptrsv_invert_diagonal(invert_diag); + + // > invert off-diagonal blocks + bool invert_offdiag = true; + khUd.set_sptrsv_column_major(true); + khLd.set_sptrsv_invert_offdiagonal(invert_offdiag); + khUd.set_sptrsv_invert_offdiagonal(invert_offdiag); + + // > forcing sptrsv compute to perform TRMM on device + khLd.set_sptrsv_diag_supernode_sizes(1, 1); + khUd.set_sptrsv_diag_supernode_sizes(1, 1); + + // > symbolic (on host) + sptrsv_supernodal_symbolic(nsupers, supercols.data(), etree, L.graph, + &khLd, Ut.graph, &khUd); + // > numeric (on host) + sptrsv_compute(&khLd, L); + sptrsv_compute(&khUd, Ut); + Kokkos::fence(); + + // > solve + ValuesType b("b", nrows); + Kokkos::deep_copy(b, B); + Kokkos::deep_copy(X, ZERO); + sptrsv_solve(&khLd, &khUd, X, b); + Kokkos::fence(); + + // > check + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, X.extent(0)), + ReductionCheck(X), sum); + EXPECT_EQ(sum, lhs.extent(0)); + EXPECT_EQ(sum, X.extent(0)); + + khLd.destroy_sptrsv_handle(); + khUd.destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == 
scalar_t(X.extent(0))); - - khLd.destroy_sptrsv_handle(); - khUd.destroy_sptrsv_handle(); - } #endif + } } -} - -template -void run_test_sptrsv_streams(int test_algo, int nstreams) { - using RowMapType = Kokkos::View; - using EntriesType = Kokkos::View; - using ValuesType = Kokkos::View; - using RowMapType_hostmirror = typename RowMapType::HostMirror; - using EntriesType_hostmirror = typename EntriesType::HostMirror; - using ValuesType_hostmirror = typename ValuesType::HostMirror; - using execution_space = typename device::execution_space; - using memory_space = typename device::memory_space; - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, execution_space, memory_space, memory_space>; - using crsMat_t = CrsMatrix; - // Workaround for OpenMP: skip tests if concurrency < nstreams because of - // not enough resource to partition - bool run_streams_test = true; + static void run_test_sptrsv_streams(int test_algo, int nstreams) { + // Workaround for OpenMP: skip tests if concurrency < nstreams because of + // not enough resource to partition + bool run_streams_test = true; #ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - int exec_concurrency = execution_space().concurrency(); - if (exec_concurrency < nstreams) { - run_streams_test = false; - std::cout << " Skip stream test: concurrency = " << exec_concurrency - << std::endl; + if (std::is_same::value) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + run_streams_test = false; + std::cout << " Skip stream test: concurrency = " << exec_concurrency + << std::endl; + } } - } #endif - if (!run_streams_test) return; - - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); - - const size_type nrows = 5; - const size_type nnz = 10; - - std::vector instances; - if (nstreams == 1) - instances = Kokkos::Experimental::partition_space(execution_space(), 1); - else if (nstreams == 2) - instances = 
Kokkos::Experimental::partition_space(execution_space(), 1, 1); - else if (nstreams == 3) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); - else // (nstreams == 4) - instances = - Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); - - std::vector kh_v(nstreams); - std::vector kh_ptr_v(nstreams); - std::vector row_map_v(nstreams); - std::vector entries_v(nstreams); - std::vector values_v(nstreams); - std::vector rhs_v(nstreams); - std::vector lhs_v(nstreams); - - RowMapType_hostmirror hrow_map("hrow_map", nrows + 1); - EntriesType_hostmirror hentries("hentries", nnz); - ValuesType_hostmirror hvalues("hvalues", nnz); - - // Upper tri - { - hrow_map(0) = 0; - hrow_map(1) = 2; - hrow_map(2) = 4; - hrow_map(3) = 7; - hrow_map(4) = 9; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 2; - hentries(2) = 1; - hentries(3) = 4; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 4; - hentries(7) = 3; - hentries(8) = 4; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; - } + if (!run_streams_test) return; - for (int i = 0; i < nstreams; i++) { - // Allocate U - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); + scalar_t ZERO = scalar_t(0); + scalar_t ONE = scalar_t(1); - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); + const size_type nrows = 5; + const size_type nnz = 10; - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); + auto instances = Kokkos::Experimental::partition_space( + execution_space(), std::vector(nstreams, 1)); - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); + std::vector kh_v(nstreams); + 
std::vector kh_ptr_v(nstreams); + std::vector row_map_v(nstreams); + std::vector entries_v(nstreams); + std::vector values_v(nstreams); + std::vector rhs_v(nstreams); + std::vector lhs_v(nstreams); - // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); - - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); - - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = false; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams - - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) { - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy( - 0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - if (sum != lhs_v[i].extent(0)) { - std::cout << "Upper Tri Solve FAILURE on stream " << i << std::endl; - kh_v[i].get_sptrsv_handle()->print_algorithm(); - } - EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); + RowMapType_hostmirror hrow_map; + EntriesType_hostmirror hentries; + ValuesType_hostmirror hvalues; - kh_v[i].destroy_sptrsv_handle(); - } - } - - // Lower tri - { - hrow_map(0) = 0; - hrow_map(1) = 1; - hrow_map(2) = 2; - hrow_map(3) = 4; - hrow_map(4) = 6; - hrow_map(5) = 10; - - hentries(0) = 0; - hentries(1) = 1; - hentries(2) = 0; - 
hentries(3) = 2; - hentries(4) = 2; - hentries(5) = 3; - hentries(6) = 1; - hentries(7) = 2; - hentries(8) = 3; - hentries(9) = 4; - - for (size_type i = 0; i < nnz; ++i) { - hvalues(i) = ONE; + // Upper tri + { + auto fixture = get_5x5_ut_ones_fixture(); + compress_matrix(hrow_map, hentries, hvalues, fixture); + + for (int i = 0; i < nstreams; i++) { + // Allocate U + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = false; + if (test_algo == 0) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, + rhs_v, lhs_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); 
+ + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + EXPECT_EQ(sum, lhs_v[i].extent(0)); + + kh_v[i].destroy_sptrsv_handle(); + } } - for (int i = 0; i < nstreams; i++) { - // Allocate L - row_map_v[i] = RowMapType("row_map", nrows + 1); - entries_v[i] = EntriesType("entries", nnz); - values_v[i] = ValuesType("values", nnz); - - // Copy from host to device - Kokkos::deep_copy(row_map_v[i], hrow_map); - Kokkos::deep_copy(entries_v[i], hentries); - Kokkos::deep_copy(values_v[i], hvalues); - - // Create known_lhs, generate rhs, then solve for lhs to compare to - // known_lhs - ValuesType known_lhs("known_lhs", nrows); - // Create known solution lhs set to all 1's - Kokkos::deep_copy(known_lhs, ONE); - - // Solution to find - lhs_v[i] = ValuesType("lhs", nrows); - - // A*known_lhs generates rhs: rhs is dense, use spmv - rhs_v[i] = ValuesType("rhs", nrows); - - crsMat_t triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], - entries_v[i]); - - KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); - Kokkos::fence(); - - // Create handle - kh_v[i] = KernelHandle(); - bool is_lower_tri = true; - if (test_algo == 0) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, - is_lower_tri); - else if (test_algo == 1) - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, - is_lower_tri); - else - kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, - is_lower_tri); - - kh_ptr_v[i] = &kh_v[i]; - - // Symbolic phase - sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); - Kokkos::fence(); - } // Done handle creation and sptrsv_symbolic on all streams - - // Solve phase - sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, entries_v, values_v, - rhs_v, lhs_v); - - for (int i = 0; i < nstreams; i++) instances[i].fence(); - - // Checking - for (int i = 0; i < nstreams; i++) 
{ - scalar_t sum = 0.0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy( - 0, lhs_v[i].extent(0)), - ReductionCheck(lhs_v[i]), sum); - if (sum != lhs_v[i].extent(0)) { - std::cout << "Lower Tri Solve FAILURE on stream " << i << std::endl; - kh_v[i].get_sptrsv_handle()->print_algorithm(); + // Lower tri + { + auto fixture = get_5x5_lt_ones_fixture(); + compress_matrix(hrow_map, hentries, hvalues, fixture); + + for (int i = 0; i < nstreams; i++) { + // Allocate L + row_map_v[i] = RowMapType("row_map", nrows + 1); + entries_v[i] = EntriesType("entries", nnz); + values_v[i] = ValuesType("values", nnz); + + // Copy from host to device + Kokkos::deep_copy(row_map_v[i], hrow_map); + Kokkos::deep_copy(entries_v[i], hentries); + Kokkos::deep_copy(values_v[i], hvalues); + + // Create known_lhs, generate rhs, then solve for lhs to compare to + // known_lhs + ValuesType known_lhs("known_lhs", nrows); + // Create known solution lhs set to all 1's + Kokkos::deep_copy(known_lhs, ONE); + + // Solution to find + lhs_v[i] = ValuesType("lhs", nrows); + + // A*known_lhs generates rhs: rhs is dense, use spmv + rhs_v[i] = ValuesType("rhs", nrows); + + Crs triMtx("triMtx", nrows, nrows, nnz, values_v[i], row_map_v[i], + entries_v[i]); + + KokkosSparse::spmv("N", ONE, triMtx, known_lhs, ZERO, rhs_v[i]); + Kokkos::fence(); + + // Create handle + kh_v[i] = KernelHandle(); + bool is_lower_tri = true; + if (test_algo == 0) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, + is_lower_tri); + else if (test_algo == 1) + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, + is_lower_tri); + else + kh_v[i].create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, + is_lower_tri); + + kh_ptr_v[i] = &kh_v[i]; + + // Symbolic phase + sptrsv_symbolic(kh_ptr_v[i], row_map_v[i], entries_v[i], values_v[i]); + Kokkos::fence(); + } // Done handle creation and sptrsv_symbolic on all streams + + // Solve phase + sptrsv_solve_streams(instances, kh_ptr_v, row_map_v, 
entries_v, values_v, + rhs_v, lhs_v); + + for (int i = 0; i < nstreams; i++) instances[i].fence(); + + // Checking + for (int i = 0; i < nstreams; i++) { + scalar_t sum = 0.0; + Kokkos::parallel_reduce(range_policy_t(0, lhs_v[i].extent(0)), + ReductionCheck(lhs_v[i]), sum); + EXPECT_EQ(sum, lhs_v[i].extent(0)); + + kh_v[i].destroy_sptrsv_handle(); } - EXPECT_TRUE(sum == scalar_t(lhs_v[i].extent(0))); - - kh_v[i].destroy_sptrsv_handle(); } } -} +}; } // namespace Test template void test_sptrsv() { - Test::run_test_sptrsv(); - // Test::run_test_sptrsv_mtx(); + using TestStruct = Test::SptrsvTest; + TestStruct::run_test_sptrsv(); } template void test_sptrsv_streams() { - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 1 stream" << std::endl; - Test::run_test_sptrsv_streams(0, 1); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 2 streams" << std::endl; - Test::run_test_sptrsv_streams(0, 2); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 3 streams" << std::endl; - Test::run_test_sptrsv_streams(0, 3); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_RP: 4 streams" << std::endl; - Test::run_test_sptrsv_streams(0, 4); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 1 stream" << std::endl; - Test::run_test_sptrsv_streams(1, 1); + using TestStruct = Test::SptrsvTest; - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 2 streams" << std::endl; - Test::run_test_sptrsv_streams(1, 2); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 3 streams" << std::endl; - Test::run_test_sptrsv_streams(1, 3); - - std::cout << "SPTRSVAlgorithm::SEQLVLSCHD_TP1: 4 streams" << std::endl; - Test::run_test_sptrsv_streams(1, 4); + TestStruct::run_test_sptrsv_streams(0, 1); + TestStruct::run_test_sptrsv_streams(0, 2); + TestStruct::run_test_sptrsv_streams(0, 3); + TestStruct::run_test_sptrsv_streams(0, 4); + TestStruct::run_test_sptrsv_streams(1, 1); + TestStruct::run_test_sptrsv_streams(1, 2); + TestStruct::run_test_sptrsv_streams(1, 3); + TestStruct::run_test_sptrsv_streams(1, 4); #if 
defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) if (std::is_same::value && std::is_same::value) { - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 1 stream" << std::endl; - Test::run_test_sptrsv_streams(2, 1); - - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 2 streams" << std::endl; - Test::run_test_sptrsv_streams(2, 2); - - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 3 streams" << std::endl; - Test::run_test_sptrsv_streams(2, 3); - - std::cout << "SPTRSVAlgorithm::SPTRSV_CUSPARSE: 4 streams" << std::endl; - Test::run_test_sptrsv_streams(2, 4); + TestStruct::run_test_sptrsv_streams(2, 1); + TestStruct::run_test_sptrsv_streams(2, 2); + TestStruct::run_test_sptrsv_streams(2, 3); + TestStruct::run_test_sptrsv_streams(2, 4); } #endif } diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_trsv.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_trsv.hpp index d580cc472dea..8fb4763d7121 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_trsv.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_trsv.hpp @@ -34,89 +34,131 @@ typedef Kokkos::complex kokkos_complex_double; typedef Kokkos::complex kokkos_complex_float; namespace Test { -// TODO: remove this once MD develop branch is merge. -// The below functionolity exists in SparseUtils. - -template -void check_trsv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type b, - y_vector_type expected_x, int numMV, const char uplo[], - const char trans[]) { - // typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename scalar_view_t::value_type ScalarA; - double eps = (std::is_same::value - ? 2 * 1e-2 - : (std::is_same>::value || - std::is_same>::value) - ? 
2 * 1e-1 - : 1e-7); - - Kokkos::fence(); - KokkosSparse::trsv(uplo, trans, "N", input_mat, b, x); - - for (int i = 0; i < numMV; ++i) { - auto x_i = Kokkos::subview(x, Kokkos::ALL(), i); - - auto expected_x_i = Kokkos::subview(expected_x, Kokkos::ALL(), i); - - EXPECT_NEAR_KK_1DVIEW(expected_x_i, x_i, eps); - } + +template < + typename Crs, typename LUType, typename size_type, + typename std::enable_if::value>::type* = nullptr> +LUType get_LU(char l_or_u, int n, size_type& nnz, int row_size_variance, + int bandwidth, int) { + auto LU = KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( + l_or_u, n, n, nnz, row_size_variance, bandwidth); + + return LU; +} + +template < + typename Crs, typename LUType, typename size_type, + typename std::enable_if::value>::type* = nullptr> +LUType get_LU(char l_or_u, int n, size_type& nnz, int row_size_variance, + int bandwidth, int block_size) { + auto LU_unblocked = + KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( + l_or_u, n, n, nnz, row_size_variance, bandwidth); + + // Convert to BSR + LUType LU(LU_unblocked, block_size); + + return LU; } -} // namespace Test template -void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, int numMV) { - lno_t numCols = numRows; + typename layout, typename device> +struct TrsvTest { + using View2D = Kokkos::View; + using execution_space = typename device::execution_space; + + using Crs = CrsMatrix; + using Bsr = BsrMatrix; + + // TODO: remove this once MD develop branch is merge. + // The below functionolity exists in SparseUtils. + template + static void check_trsv_mv(sp_matrix_type input_mat, View2D x, View2D b, + View2D expected_x, int numMV, const char uplo[], + const char trans[]) { + double eps = (std::is_same::value + ? 2 * 1e-2 + : (std::is_same>::value || + std::is_same>::value) + ? 
2 * 1e-1 + : 1e-7); + + Kokkos::fence(); + KokkosSparse::trsv(uplo, trans, "N", input_mat, b, x); + + for (int i = 0; i < numMV; ++i) { + auto x_i = Kokkos::subview(x, Kokkos::ALL(), i); + + auto expected_x_i = Kokkos::subview(expected_x, Kokkos::ALL(), i); + + EXPECT_NEAR_KK_1DVIEW(expected_x_i, x_i, eps); + } + } + + template + static void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, + lno_t row_size_variance, int numMV) { + using sp_matrix_type = std::conditional_t; - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - // typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + constexpr auto block_size = UseBlocks ? 10 : 1; - typedef Kokkos::View ViewTypeX; - typedef Kokkos::View ViewTypeY; + lno_t numCols = numRows; - ViewTypeX b_x("A", numRows, numMV); - ViewTypeY b_y("B", numCols, numMV); - ViewTypeX b_x_copy("B", numCols, numMV); + View2D b_x("A", numRows, numMV); + View2D b_y("B", numCols, numMV); + View2D b_x_copy("B", numCols, numMV); - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); - Kokkos::fill_random(b_x_copy, rand_pool, scalar_t(10)); + Kokkos::Random_XorShift64_Pool rand_pool(13718); + Kokkos::fill_random(b_x_copy, rand_pool, scalar_t(10)); - typename ViewTypeY::non_const_value_type alpha = 1; - typename ViewTypeY::non_const_value_type beta = 0; + scalar_t alpha = 1; + scalar_t beta = 0; - // this function creates a dense lower and upper triangular matrix. - // TODO: SHOULD CHANGE IT TO SPARSE - crsMat_t lower_part = - KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( - 'L', numRows, numCols, nnz, row_size_variance, bandwidth); + // this function creates a dense lower and upper triangular matrix. 
+ auto lower_part = get_LU( + 'L', numRows, nnz, row_size_variance, bandwidth, block_size); - Test::shuffleMatrixEntries(lower_part.graph.row_map, lower_part.graph.entries, - lower_part.values); + Test::shuffleMatrixEntries(lower_part.graph.row_map, + lower_part.graph.entries, lower_part.values, + block_size); - KokkosSparse::spmv("N", alpha, lower_part, b_x_copy, beta, b_y); - Test::check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "N"); + KokkosSparse::spmv("N", alpha, lower_part, b_x_copy, beta, b_y); + check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "N"); - KokkosSparse::spmv("T", alpha, lower_part, b_x_copy, beta, b_y); - Test::check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "T"); - // typedef typename Kokkos::View indexview; + if (!UseBlocks) { + KokkosSparse::spmv("T", alpha, lower_part, b_x_copy, beta, b_y); + check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "T"); + } - crsMat_t upper_part = - KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( - 'U', numRows, numCols, nnz, row_size_variance, bandwidth); + auto upper_part = get_LU( + 'U', numRows, nnz, row_size_variance, bandwidth, block_size); - Test::shuffleMatrixEntries(upper_part.graph.row_map, upper_part.graph.entries, - upper_part.values); + Test::shuffleMatrixEntries(upper_part.graph.row_map, + upper_part.graph.entries, upper_part.values, + block_size); - KokkosSparse::spmv("N", alpha, upper_part, b_x_copy, beta, b_y); - Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "N"); + KokkosSparse::spmv("N", alpha, upper_part, b_x_copy, beta, b_y); + check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "N"); + + if (!UseBlocks) { + KokkosSparse::spmv("T", alpha, upper_part, b_x_copy, beta, b_y); + check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "T"); + } + } +}; - KokkosSparse::spmv("T", alpha, upper_part, b_x_copy, beta, b_y); - Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "T"); +} // namespace Test + +template +void 
test_trsv_mv() { + using TestStruct = Test::TrsvTest; + TestStruct::template test_trsv_mv(1000, 1000 * 30, 200, 10, 1); + TestStruct::template test_trsv_mv(800, 800 * 30, 100, 10, 5); + TestStruct::template test_trsv_mv(400, 400 * 20, 100, 5, 10); + TestStruct::template test_trsv_mv(1000, 1000 * 30, 200, 10, 1); + TestStruct::template test_trsv_mv(800, 800 * 30, 100, 10, 5); + TestStruct::template test_trsv_mv(400, 400 * 20, 100, 5, 10); } // Note BMK 7-22: the matrix generator used by this test always @@ -126,12 +168,7 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, TEST_F( \ TestCategory, \ sparse##_##trsv_mv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ - test_trsv_mv( \ - 1000, 1000 * 30, 200, 10, 1); \ - test_trsv_mv( \ - 800, 800 * 30, 100, 10, 5); \ - test_trsv_mv( \ - 400, 400 * 20, 100, 5, 10); \ + test_trsv_mv(); \ } #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_vector_fixtures.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_vector_fixtures.hpp new file mode 100644 index 000000000000..2037a5485e9f --- /dev/null +++ b/packages/kokkos-kernels/sparse/unit_test/Test_vector_fixtures.hpp @@ -0,0 +1,212 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef _TEST_VECTOR_FIXTURES_HPP +#define _TEST_VECTOR_FIXTURES_HPP + +#include + +#include + +/** + * API for working with 2D vectors of small matrices for testing. 
+ */ + +namespace Test { + +template +scalar_t KEEP_ZERO() { + return scalar_t(-9999.0); +} + +template +void compress_matrix( + MapT& map, EntriesT& entries, ValuesT& values, + const std::vector>& + fixture) { + using size_type = typename MapT::non_const_value_type; + using scalar_t = typename ValuesT::non_const_value_type; + + const scalar_t ZERO = scalar_t(0); + + const size_type nrows = fixture.size(); + const size_type ncols = fixture[0].size(); + + // Count fixture nnz's + size_type nnz = 0; + for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { + for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { + if (fixture[row_idx][col_idx] != ZERO) { + ++nnz; + } + } + } + + // Allocate device CRS views + Kokkos::resize(map, (CSC ? ncols : nrows) + 1); + Kokkos::resize(entries, nnz); + Kokkos::resize(values, nnz); + + // Create host mirror views for CRS + auto hmap = Kokkos::create_mirror_view(map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + + // Compress into CRS (host views) + size_type curr_nnz = 0; + + const size_type num_outer = (CSC ? ncols : nrows); + const size_type num_inner = (CSC ? nrows : ncols); + for (size_type outer_idx = 0; outer_idx < num_outer; ++outer_idx) { + for (size_type inner_idx = 0; inner_idx < num_inner; ++inner_idx) { + const size_type row = CSC ? inner_idx : outer_idx; + const size_type col = CSC ? outer_idx : inner_idx; + const auto val = fixture[row][col]; + if (val != ZERO) { + hentries(curr_nnz) = inner_idx; + hvalues(curr_nnz) = val == KEEP_ZERO() ? 
ZERO : val; + ++curr_nnz; + } + hmap(outer_idx + 1) = curr_nnz; + } + } + + // Copy host CRS views to device CRS views + Kokkos::deep_copy(map, hmap); + Kokkos::deep_copy(entries, hentries); + Kokkos::deep_copy(values, hvalues); +} + +template +std::vector> +decompress_matrix(const RowMapT& row_map, const EntriesT& entries, + const ValuesT& values) { + using size_type = typename RowMapT::non_const_value_type; + using scalar_t = typename ValuesT::non_const_value_type; + + const scalar_t ZERO = scalar_t(0); + + const size_type nrows = row_map.size() - 1; + std::vector> result; + result.resize(nrows); + for (auto& row : result) { + row.resize(nrows, ZERO); + } + + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + Kokkos::deep_copy(hrow_map, row_map); + Kokkos::deep_copy(hentries, entries); + Kokkos::deep_copy(hvalues, values); + + for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { + const size_type row_nnz_begin = hrow_map(row_idx); + const size_type row_nnz_end = hrow_map(row_idx + 1); + for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { + const auto col_idx = hentries(row_nnz); + const scalar_t value = hvalues(row_nnz); + if (CSC) { + result[col_idx][row_idx] = value; + } else { + result[row_idx][col_idx] = value; + } + } + } + + return result; +} + +template +std::vector> +decompress_matrix(const RowMapT& row_map, const EntriesT& entries, + const ValuesT& values, + typename RowMapT::const_value_type block_size) { + using size_type = typename RowMapT::non_const_value_type; + using scalar_t = typename ValuesT::non_const_value_type; + + const scalar_t ZERO = scalar_t(0); + + const size_type nbrows = row_map.extent(0) - 1; + const size_type nrows = nbrows * block_size; + const size_type block_items = block_size * block_size; + std::vector> result; + result.resize(nrows); + for (auto& row : result) { + row.resize(nrows, ZERO); 
+ } + + auto hrow_map = Kokkos::create_mirror_view(row_map); + auto hentries = Kokkos::create_mirror_view(entries); + auto hvalues = Kokkos::create_mirror_view(values); + Kokkos::deep_copy(hrow_map, row_map); + Kokkos::deep_copy(hentries, entries); + Kokkos::deep_copy(hvalues, values); + + for (size_type row_idx = 0; row_idx < nbrows; ++row_idx) { + const size_type row_nnz_begin = hrow_map(row_idx); + const size_type row_nnz_end = hrow_map(row_idx + 1); + for (size_type row_nnz = row_nnz_begin; row_nnz < row_nnz_end; ++row_nnz) { + const auto col_idx = hentries(row_nnz); + for (size_type i = 0; i < block_size; ++i) { + const size_type unc_row_idx = row_idx * block_size + i; + for (size_type j = 0; j < block_size; ++j) { + const size_type unc_col_idx = col_idx * block_size + j; + result[unc_row_idx][unc_col_idx] = + hvalues(row_nnz * block_items + i * block_size + j); + } + } + } + } + + return result; +} + +template +void check_matrix( + const std::string& name, const RowMapT& row_map, const EntriesT& entries, + const ValuesT& values, + const std::vector>& + expected) { + using size_type = typename RowMapT::non_const_value_type; + + const auto decompressed_mtx = decompress_matrix(row_map, entries, values); + + const size_type nrows = row_map.size() - 1; + for (size_type row_idx = 0; row_idx < nrows; ++row_idx) { + for (size_type col_idx = 0; col_idx < nrows; ++col_idx) { + EXPECT_NEAR(expected[row_idx][col_idx], + decompressed_mtx[row_idx][col_idx], 0.01) + << "Failed check is: " << name << "[" << row_idx << "][" << col_idx + << "]"; + } + } +} + +template +void print_matrix(const std::vector>& matrix) { + for (const auto& row : matrix) { + for (const auto& item : row) { + std::printf("%.5f ", item); + } + std::cout << std::endl; + } +} + +} // namespace Test + +#endif // _TEST_VECTOR_FIXTURES_HPP diff --git a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp index 
236bcdd1c83f..232b66242a27 100644 --- a/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp +++ b/packages/kokkos-kernels/test_common/KokkosKernels_TestUtils.hpp @@ -776,9 +776,11 @@ class RandCsMatrix { MapViewTypeD get_map() { return __getter_copy_helper(__map_d); } }; -/// \brief Randomly shuffle the entries in each row (col) of a Crs (Ccs) matrix. +/// \brief Randomly shuffle the entries in each row (col) of a Crs (Ccs) or Bsr +/// matrix. template -void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values) { +void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values, + const size_t block_size = 1) { using size_type = typename Rowptrs::non_const_value_type; using ordinal_type = typename Entries::value_type; auto rowptrsHost = @@ -789,6 +791,7 @@ void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values) { Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), values); ordinal_type numRows = rowptrsHost.extent(0) ? (rowptrsHost.extent(0) - 1) : 0; + const size_t block_items = block_size * block_size; for (ordinal_type i = 0; i < numRows; i++) { size_type rowBegin = rowptrsHost(i); size_type rowEnd = rowptrsHost(i + 1); @@ -796,7 +799,9 @@ void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values) { ordinal_type swapRange = rowEnd - j; size_type swapOffset = j + (rand() % swapRange); std::swap(entriesHost(j), entriesHost(swapOffset)); - std::swap(valuesHost(j), valuesHost(swapOffset)); + std::swap_ranges(valuesHost.data() + j * block_items, + valuesHost.data() + (j + 1) * block_items, + valuesHost.data() + swapOffset * block_items); } } Kokkos::deep_copy(entries, entriesHost); diff --git a/packages/kokkos-kernels/test_common/Test_HIP.hpp b/packages/kokkos-kernels/test_common/Test_HIP.hpp index c9e02698c52c..dfb8e1d687b0 100644 --- a/packages/kokkos-kernels/test_common/Test_HIP.hpp +++ b/packages/kokkos-kernels/test_common/Test_HIP.hpp @@ -31,7 +31,18 @@ class hip : public 
::testing::Test { static void TearDownTestCase() {} }; +using HIPSpaceDevice = Kokkos::Device; +using HIPManagedSpaceDevice = + Kokkos::Device; + #define TestCategory hip -#define TestDevice Kokkos::HIP + +// Prefer for any testing where only one exec space is used +#if defined(KOKKOSKERNELS_INST_MEMSPACE_HIPMANAGEDSPACE) && \ + !defined(KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE) +#define TestDevice HIPManagedSpaceDevice +#else +#define TestDevice HIPSpaceDevice +#endif #endif // TEST_HIP_HPP diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins index 6f5cf80033fa..ae3bffd92d72 100644 --- a/packages/kokkos/.jenkins +++ b/packages/kokkos/.jenkins @@ -8,16 +8,21 @@ pipeline { } options { + disableConcurrentBuilds(abortPrevious: true) timeout(time: 6, unit: 'HOURS') } + triggers { + issueCommentTrigger('.*test this please.*') + } + stages { stage('Clang-Format') { agent { dockerfile { filename 'Dockerfile.clang' dir 'scripts/docker' - label 'nvidia-docker || rocm-docker || docker' + label 'nvidia-docker || docker' args '-v /tmp/ccache.kokkos:/tmp/ccache' } } @@ -102,12 +107,11 @@ pipeline { } steps { sh 'ccache --zero-stats' - sh '''. 
/opt/intel/oneapi/setvars.sh --include-intel-llvm && \ - rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ + -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Wno-deprecated-declarations -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \ -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \ -DKokkos_ARCH_NATIVE=ON \ @@ -135,8 +139,8 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' - label 'rocm-docker && vega' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete' + label 'rocm-docker ' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } @@ -177,8 +181,8 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' - label 'rocm-docker && vega' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6-complete' + label 'rocm-docker' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } @@ -390,7 +394,6 @@ pipeline { -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ -DKokkos_ENABLE_CUDA_UVM=ON \ -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_IMPL_MDSPAN=ON \ @@ -493,7 +496,6 @@ pipeline { -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - 
-DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md index 40e3c95f24fa..244165f05709 100644 --- a/packages/kokkos/CHANGELOG.md +++ b/packages/kokkos/CHANGELOG.md @@ -1,5 +1,105 @@ # CHANGELOG +## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) + +### Features: +* Add `Experimental::sort_by_key(exec, keys, values)` algorithm [\#6801](https://github.com/kokkos/kokkos/pull/6801) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental multi-GPU support (from the same process) [\#6782](https://github.com/kokkos/kokkos/pull/6782) +* Link against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE [\#6701](https://github.com/kokkos/kokkos/pull/6701) +* Don't use the compiler launcher script if the CMake compile language is CUDA. 
[\#6704](https://github.com/kokkos/kokkos/pull/6704) +* nvcc(wrapper): adding "long" and "short" versions for all flags [\#6615](https://github.com/kokkos/kokkos/pull/6615) + +#### HIP: + * Fix compilation when using amdclang (with ROCm >= 5.7) and RDC [\#6857](https://github.com/kokkos/kokkos/pull/6857) + * Use rocthrust for sorting, when available [\#6793](https://github.com/kokkos/kokkos/pull/6793) + +#### SYCL: +* We only support OneAPI SYCL implementation: add check during initialization + * Error out on initialization if the backend is different from `ext_oneapi_*` [\#6784](https://github.com/kokkos/kokkos/pull/6784) + * Filter GPU devices for `ext_oneapi_*` GPU devices [\#6758](https://github.com/kokkos/kokkos/pull/6758) +* Performance Improvements + * Avoid unnecessary zero-memset of the scratch flags in SYCL [\#6739](https://github.com/kokkos/kokkos/pull/6739) + * Use host-pinned memory to copy reduction/scan result [\#6500](https://github.com/kokkos/kokkos/pull/6500) +* Address deprecations after oneAPI 2023.2.0 [\#6577](https://github.com/kokkos/kokkos/pull/6577) +* Make sure to call find_dependency for oneDPL if necessary [\#6870](https://github.com/kokkos/kokkos/pull/6870) + +#### OpenMPTarget: +* Use LLVM extensions for dynamic shared memory [\#6380](https://github.com/kokkos/kokkos/pull/6380) +* Guard scratch memory usage in ParallelReduce [\#6585](https://github.com/kokkos/kokkos/pull/6585) +* Update linker flags for Intel GPUs update [\#6735](https://github.com/kokkos/kokkos/pull/6735) +* Improve handling of printf on Intel GPUs [\#6652](https://github.com/kokkos/kokkos/pull/6652) + +#### OpenACC: +* Add atomics support [\#6446](https://github.com/kokkos/kokkos/pull/6446) +* Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) + +#### Threads: +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) + +#### OpenMP: +* Improve performance of view
initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) + +### General Enhancements + +* Improve performance of random number generation when using a normal distribution on GPUs [\#6556](https://github.com/kokkos/kokkos/pull/6556) +* Allocate temporary view with the user-provided execution space instance and do not initialize in `unique` algorithm [\#6598](https://github.com/kokkos/kokkos/pull/6598) +* Add deduction guide for `Kokkos::Array` [\#6373](https://github.com/kokkos/kokkos/pull/6373) +* Provide new public headers `` and `` [\#6687](https://github.com/kokkos/kokkos/pull/6687) +* Fix/improvement to `remove_if` parallel algorithm: use the provided execution space instance for temporary allocations and drop unnecessary initialization + avoid evaluating twice the predicate during final pass [\#6747](https://github.com/kokkos/kokkos/pull/6747) +* Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` [\#6713](https://github.com/kokkos/kokkos/pull/6713) +* simd: support `vector_aligned_tag` [\#6243](https://github.com/kokkos/kokkos/pull/6243) +* Avoid unnecessary allocation when default constructing Bitset [\#6524](https://github.com/kokkos/kokkos/pull/6524) +* Fix constness for views in std algorithms [\#6813](https://github.com/kokkos/kokkos/pull/6813) +* Improve error message on unsafe implicit conversion in MDRangePolicy [\#6855](https://github.com/kokkos/kokkos/pull/6855) +* CTAD (deduction guides) for RangePolicy [\#6850](https://github.com/kokkos/kokkos/pull/6850) +* CTAD (deduction guides) for MDRangePolicy [\#5516](https://github.com/kokkos/kokkos/pull/5516) + +### Build System Changes +* Require `Kokkos_ENABLE_ATOMICS_BYPASS` option to bypass atomic operation for Serial backend only builds [\#6692](https://github.com/kokkos/kokkos/pull/6692) +* Add support for RISCV and the Milk-V's Pioneer [\#6773](https://github.com/kokkos/kokkos/pull/6773) +* Add C++26
standard to CMake setup [\#6733](https://github.com/kokkos/kokkos/pull/6733) +* Fix Makefile when using gnu_generate_makefile.sh and make >= 4.3 [\#6606](https://github.com/kokkos/kokkos/pull/6606) +* Cuda: Fix configuring with CMake >= 3.28.4 - temporary fallback to internal CudaToolkit.cmake [\#6898](https://github.com/kokkos/kokkos/pull/6898) + +### Incompatibilities (i.e. breaking changes) +* Remove all `DEPRECATED_CODE_3` option and all code that was guarded by it [\#6523](https://github.com/kokkos/kokkos/pull/6523) +* Drop guards to accommodate external code defining `KOKKOS_ASSERT` [\#6665](https://github.com/kokkos/kokkos/pull/6665) +* `Profiling::ProfilingSection(std::string)` constructor marked explicit and nodiscard [\#6690](https://github.com/kokkos/kokkos/pull/6690) +* Add bound check preconditions for `RangePolicy` and `MDRangePolicy` [\#6617](https://github.com/kokkos/kokkos/pull/6617) [\#6726](https://github.com/kokkos/kokkos/pull/6726) +* Add checks for unsafe implicit conversions in RangePolicy [\#6754](https://github.com/kokkos/kokkos/pull/6754) +* Remove Kokkos::[b]half_t volatile overloads [\#6579](https://github.com/kokkos/kokkos/pull/6579) +* Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF [\#6593](https://github.com/kokkos/kokkos/pull/6593) +* Check matching static extents in View constructor [\#5190 ](https://github.com/kokkos/kokkos/pull/5190) +* Tools(profiling): fix typo Kokkos_Tools_Optim[i]zationGoal [\#6642](https://github.com/kokkos/kokkos/pull/6642) +* Remove variadic range policy constructor (disallow passing multiple trailing chunk size arguments) [\#6845](https://github.com/kokkos/kokkos/pull/6845) +* Improve message on view out of bounds access and always abort [\#6861](https://github.com/kokkos/kokkos/pull/6861) +* Drop `KOKKOS_ENABLE_INTEL_MM_ALLOC` macro [\#6797](https://github.com/kokkos/kokkos/pull/6797) +* Remove `Kokkos::Experimental::LogicalMemorySpace` (without going through deprecation) 
[\#6557](https://github.com/kokkos/kokkos/pull/6557) +* Remove `Experimental::HBWSpace` and support for linking against memkind [\#6791](https://github.com/kokkos/kokkos/pull/6791) +* Drop librt TPL and associated `KOKKOS_ENABLE_LIBRT` macro [\#6798](https://github.com/kokkos/kokkos/pull/6798) +* Drop support for old CPU architectures (`ARCH_BGQ`, `ARCH_POWER7`, `ARCH_WSM` and associated `ARCH_SSE4` macro) [\#6806](https://github.com/kokkos/kokkos/pull/6806) +* Drop support for deprecated command-line arguments and environment variables [\#6744](https://github.com/kokkos/kokkos/pull/6744) + +### Deprecations +* Provide kokkos_swap as part of Core and deprecate Experimental::swap in Algorithms [\#6697](https://github.com/kokkos/kokkos/pull/6697) +* Deprecate {Cuda,HIP}::detect_device_count() and Cuda::[detect_]device_arch() [\#6710](https://github.com/kokkos/kokkos/pull/6710) +* Deprecate `ExecutionSpace::in_parallel()` [\#6582](https://github.com/kokkos/kokkos/pull/6582) + +### Bug Fixes +* Fix team-level MDRange reductions: [\#6511](https://github.com/kokkos/kokkos/pull/6511) +* Fix CUDA and SYCL small value type (16-bit) team reductions [\#5334](https://github.com/kokkos/kokkos/pull/5334) +* Enable `{transform_}exclusive_scan` in place [\#6667](https://github.com/kokkos/kokkos/pull/6667) +* `fill_random` overload that do not take an execution space instance argument should fence [\#6658](https://github.com/kokkos/kokkos/pull/6658) +* HIP,Cuda,OpenMPTarget: Fixup use provided execution space when copying host inaccessible reduction result [\#6777](https://github.com/kokkos/kokkos/pull/6777) +* Fix typo in `cuda_func_set_attribute[s]_wrapper` preventing proper setting of desired occupancy [\#6786](https://github.com/kokkos/kokkos/pull/6786) +* Avoid undefined behavior due to conversion between signed and unsigned integers in shift_{right, left}_team_impl [\#6821](https://github.com/kokkos/kokkos/pull/6821) +* Fix a bug in Makefile.kokkos when using AMD GPU 
architectures as `AMD_GFXYYY` [\#6892](https://github.com/kokkos/kokkos/pull/6892) + ## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) @@ -999,95 +1099,95 @@ - Major update for OpenMPTarget: many capabilities now work. For details contact us. - Added DPC++/SYCL backend: primary capabilites are working. - Added Kokkos Graph API analogous to CUDA Graphs. -- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/#3536) -- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/#3546) -- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/#3439) -- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/#3379) +- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536) +- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546) +- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439) +- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379) **Implemented enhancements Backends and Archs:** -- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/#3614) -- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/#3375) -- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/#3583) -- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/#3577) -- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/#3544) -- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/#3550) -- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/#3480) -- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/#3474) -- SYCL feature 
level 3 [\#3451](https://github.com/kokkos/kokkos/pull/#3451) -- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/#3447) -- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/#3504) -- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/#3411) -- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/#3440) -- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/#3418) -- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/#3366) +- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614) +- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/3375) +- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583) +- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577) +- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544) +- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550) +- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480) +- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474) +- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451) +- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447) +- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504) +- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411) +- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440) +- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418) +- HIP Implement multiple occupancy paths for various HIP kernel launchers 
[\#3366](https://github.com/kokkos/kokkos/pull/3366) **Implemented enhancements Policies:** -- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/#3494) -- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/#3527) -- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/#3395) -- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/#3362) -- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/#3369) -- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/#3206) -- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/#3509) +- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494) +- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/3527) +- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/3395) +- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/3362) +- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/3369) +- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/3206) +- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/3509) **Implemented enhancements BuildSystem:** -- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/#3488) -- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/#3548) -- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/#3136) -- Adding 
-include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/#3434) -- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/#3402) -- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/#3457) +- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/3488) +- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/3548) +- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/3136) +- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/3434) +- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/3402) +- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/3457) **Implemented enhancements Tools:** -- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/#3455) -- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/#3530) -- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/#3518) -- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/#3459) -- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/#3326) +- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/3455) +- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/3530) +- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/3518) +- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/3459) +- Dual View 
Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/3326) **Implemented enhancements Other:** -- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/#3528) -- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/#3449) -- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/#3436) -- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/#3435) -- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/#3422) -- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/#3416) -- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/#3388) -- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/#3359) -- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/#3357) -- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/#3340) -- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/#3339) -- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/#3338) -- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/#3309) -- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/#3265) -- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/#2941) +- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/3528) +- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/3449) +- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/3436) +- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/3435) +- Replace enums in 
Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/3422) +- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/3416) +- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/3388) +- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/3359) +- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/3357) +- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/3340) +- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/3339) +- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/3338) +- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/3309) +- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/3265) +- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/2941) **Fixed bugs:** -- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/#3591) -- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/#3588) -- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/#3566) -- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/#3565) -- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/#3532) -- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/#3529) -- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/#3510) -- Fixup do not require atomic operation to be default constructible 
[\#3503](https://github.com/kokkos/kokkos/pull/#3503) -- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/#3467) -- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/#3458) -- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/#3398) -- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/#3393) -- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/#3390) -- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/#3378) -- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/#3348) -- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/#3345) -- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/#3343) -- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/#3260) +- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/3591) +- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/3588) +- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/3566) +- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/3565) +- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/3532) +- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/3529) +- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/3510) +- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/3503) +- Fix race condition in HIP backend 
[\#3467](https://github.com/kokkos/kokkos/pull/3467) +- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/3458) +- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/3398) +- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/3393) +- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/3390) +- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/3378) +- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/3348) +- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/3345) +- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/3343) +- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/3260) **Incompatibilities:** -- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/#3535) -- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/#3534) -- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/#3301) -- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/#3264) -- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/#3148) +- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/3535) +- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/3534) +- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/3301) +- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/3264) +- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/3148) ## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17) 
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01) diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt index 4a4e7a550192..93a796f200b5 100644 --- a/packages/kokkos/CMakeLists.txt +++ b/packages/kokkos/CMakeLists.txt @@ -150,8 +150,8 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 2) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 3) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos index 6e28d2c0cc6a..2c74dd77bfb7 100644 --- a/packages/kokkos/Makefile.kokkos +++ b/packages/kokkos/Makefile.kokkos @@ -1,8 +1,8 @@ # Default settings common options. KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 2 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 3 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -12,14 +12,14 @@ KOKKOS_DEVICES ?= "Threads" # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX -# IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100 +# IBM: Power8,Power9 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" -# Options: hwloc,librt,experimental_memkind +# Options: hwloc KOKKOS_USE_TPLS ?= 
"" # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b KOKKOS_CXX_STANDARD ?= "c++17" @@ -46,7 +46,7 @@ uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$( uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT) # Return a 1 if a string contains a substring and 0 if not # Note the search string should be without '"' -# Example: $(call kokkos_has_string,"hwloc,librt",hwloc) +# Example: $(call kokkos_has_string,"hwloc,libdl",hwloc) # Will return a 1 kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0) # Returns 1 if the path exists, 0 otherwise @@ -63,11 +63,11 @@ KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD), KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a) KOKKOS_INTERNAL_ENABLE_CXX23 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++23) KOKKOS_INTERNAL_ENABLE_CXX2B := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2b) +KOKKOS_INTERNAL_ENABLE_CXX26 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++26) +KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2c) # Check for external libraries. KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) -KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) -KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind) # Check for advanced settings. KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) @@ -308,7 +308,6 @@ endif # Intel based. 
KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC) -KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM) KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB) KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW) KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) @@ -388,11 +387,9 @@ KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) # IBM based. -KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) -KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7) KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9) -KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) +KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) # AMD based. 
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) @@ -403,22 +400,37 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) endif endif -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)) + +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A) +endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) +ifeq 
($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) +endif # Any AVX? -KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) # Incompatible flags? -KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -563,6 +575,16 @@ ifeq 
($(KOKKOS_INTERNAL_ENABLE_CXX2B), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2B_FLAG) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX23") endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX26), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX26_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2C), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2C_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) @@ -602,27 +624,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC") endif -ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT") - KOKKOS_LIBS += -lrt - KOKKOS_TPL_LIBRARY_NAMES += rt -endif - -ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - ifneq ($(KOKKOS_CMAKE), yes) - ifneq ($(MEMKIND_PATH),) - KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include - KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib - KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include - KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib - endif - KOKKOS_LIBS += -lmemkind -lnuma - KOKKOS_TPL_LIBRARY_NAMES += memkind numa - endif - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE") -endif - ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS") endif @@ -689,10 +690,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND") - endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) tmp := $(call kokkos_append_header,"$H""define 
KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else @@ -817,20 +814,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) endif endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42") - - ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) - KOKKOS_CXXFLAGS += -xSSE4.2 - KOKKOS_LDFLAGS += -xSSE4.2 - else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - else - # Assume that this is a really a GNU compiler. - KOKKOS_CXXFLAGS += -msse4.2 - KOKKOS_LDFLAGS += -msse4.2 - endif -endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX") @@ -1239,7 +1222,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") @@ -1279,10 +1261,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif 
endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) @@ -1393,11 +1371,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) KOKKOS_TPL_LIBRARY_NAMES += hpx endif -# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning. -ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC)) -endif - # With Cygwin functions such as fdopen and fileno are not defined # when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here. Not sure if that has any bad side effects @@ -1451,6 +1424,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */") endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") +endif tmp := $(call desul_append_header, "") tmp := $(call desul_append_header, "$H""endif") @@ -1483,7 +1462,7 @@ include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: rm -f $(KOKKOS_OBJ_LINK) $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ - KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_SetupBackend.tmp libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/packages/kokkos/Makefile.targets b/packages/kokkos/Makefile.targets index ec8770dd7de0..e6900a822a89 100644 --- a/packages/kokkos/Makefile.targets +++ b/packages/kokkos/Makefile.targets @@ -20,8 +20,6 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) 
$(KOKKOS_PATH)/core/src/impl/Kokkos_Ta $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp @@ -30,8 +28,6 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp -Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp @@ -82,8 +78,10 @@ Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array endif ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) -Kokkos_ThreadsExec.o: 
$(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp +Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp +Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -123,6 +121,3 @@ Kokkos_OpenACC_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC Kokkos_OpenACC_SharedAllocationRecord.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp endif - -Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp diff --git a/packages/kokkos/README.md b/packages/kokkos/README.md index 033346e956e0..19793bb82d94 100644 --- a/packages/kokkos/README.md +++ b/packages/kokkos/README.md @@ -28,7 +28,7 @@ To start learning about Kokkos: - [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -For questions find us on Slack: https://kokkosteam.slack.com or open a github issue. +For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. 
For non-public questions send an email to: *crtrott(at)sandia.gov* @@ -48,10 +48,10 @@ Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citati # License -[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![License](https://img.shields.io/badge/License-Apache--2.0_WITH_LLVM--exception-blue)](https://spdx.org/licenses/LLVM-exception.html) Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. -The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or -[here](https://github.com/kokkos/kokkos/blob/master/LICENSE). +The full license statement used in all headers is available [here](https://kokkos.org/kokkos-core-wiki/license.html) or +[here](https://github.com/kokkos/kokkos/blob/develop/LICENSE). diff --git a/packages/kokkos/SECURITY.md b/packages/kokkos/SECURITY.md new file mode 100644 index 000000000000..93cf6e3663e4 --- /dev/null +++ b/packages/kokkos/SECURITY.md @@ -0,0 +1,12 @@ +# Reporting Security Issues + +To report a security issue, please email +[lebrungrandt@ornl.gov](mailto:lebrungrandt@ornl.gov) +and [crtrott@sandia.gov](mailto:crtrott@sandia.gov) +with a description of the issue, the steps you took to create the issue, +affected versions, and, if known, mitigations for the issue. + +Our vulnerability management team will respond within 5 working days of your +email. If the issue is confirmed as a vulnerability, we will open a +Security Advisory and acknowledge your contributions as part of it. This project +follows a 90 day disclosure timeline. 
diff --git a/packages/kokkos/Spack.md b/packages/kokkos/Spack.md index 79606c259d5b..06c763a64ee0 100644 --- a/packages/kokkos/Spack.md +++ b/packages/kokkos/Spack.md @@ -159,7 +159,6 @@ If you don't specify a CUDA build variant in a `packages.yaml` and you build you > spack install superscience ```` you may end up just getting the default Kokkos (i.e. Serial). -Some examples are included in the `config/yaml` folder for common platforms. Before running `spack install ` we recommend running `spack spec ` to confirm your dependency tree is correct. For example, with Kokkos Kernels: ````bash diff --git a/packages/kokkos/algorithms/src/CMakeLists.txt b/packages/kokkos/algorithms/src/CMakeLists.txt index 169577894728..b490caca6282 100644 --- a/packages/kokkos/algorithms/src/CMakeLists.txt +++ b/packages/kokkos/algorithms/src/CMakeLists.txt @@ -30,5 +30,5 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms ${CMAKE_CURRENT_SOURCE_DIR} ) - - +KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) +KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) diff --git a/packages/kokkos/algorithms/src/Kokkos_Random.hpp b/packages/kokkos/algorithms/src/Kokkos_Random.hpp index 2d7d236d2fc2..7df12b8518eb 100644 --- a/packages/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_Random.hpp @@ -849,18 +849,17 @@ class Random_XorShift64 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * 
Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -1094,18 +1093,17 @@ class Random_XorShift1024 { return drand(end - start) + start; } - // Marsaglia polar method for drawing a standard normal distributed random + // Box-muller method for drawing a standard normal distributed random // number KOKKOS_INLINE_FUNCTION double normal() { - double S = 2.0; - double U; - while (S >= 1.0) { - U = 2.0 * drand() - 1.0; - const double V = 2.0 * drand() - 1.0; - S = U * U + V * V; - } - return U * std::sqrt(-2.0 * std::log(S) / S); + constexpr auto two_pi = 2 * Kokkos::numbers::pi_v; + + const double u = drand(); + const double v = drand(); + const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u)); + const double theta = v * two_pi; + return r * Kokkos::cos(theta); } KOKKOS_INLINE_FUNCTION @@ -1545,13 +1543,23 @@ template void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin, typename ViewType::const_value_type end) { - fill_random(typename ViewType::execution_space{}, a, g, begin, end); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, begin, end); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } template void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) { - fill_random(typename ViewType::execution_space{}, a, g, 0, range); + Kokkos::fence( + "fill_random: fence before since no execution space instance provided"); + typename ViewType::execution_space exec; + fill_random(exec, a, g, 0, range); + exec.fence( + "fill_random: fence after since no execution space instance provided"); } } // namespace Kokkos diff --git a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp index f77484cc5559..136b4ec82dcd 100644 --- a/packages/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -23,6 
+23,7 @@ #include "sorting/Kokkos_BinSortPublicAPI.hpp" #include "sorting/Kokkos_SortPublicAPI.hpp" +#include "sorting/Kokkos_SortByKeyPublicAPI.hpp" #include "sorting/Kokkos_NestedSortPublicAPI.hpp" #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT diff --git a/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp b/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp index 436ae0d10bf8..b532a774e130 100644 --- a/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp +++ b/packages/kokkos/algorithms/src/Kokkos_StdAlgorithms.hpp @@ -35,7 +35,6 @@ // following the std classification. // modifying ops -#include "std_algorithms/Kokkos_Swap.hpp" #include "std_algorithms/Kokkos_IterSwap.hpp" // non-modifying sequence diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp new file mode 100644 index 000000000000..fc73eccad68c --- /dev/null +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ +#define KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ + +#include "./impl/Kokkos_SortByKeyImpl.hpp" +#include +#include + +namespace Kokkos::Experimental { + +// --------------------------------------------------------------- +// basic overloads +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. 
The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_without_comparator(exec, keys, + values); +} + +// --------------------------------------------------------------- +// overloads supporting a custom comparator +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. 
The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_with_comparator(exec, keys, values, + comparator); +} + +} // namespace Kokkos::Experimental +#endif diff --git a/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index a763c41e5807..308e9e3a008b 100644 --- a/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/packages/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -29,7 +29,7 @@ namespace Kokkos { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view) { // constraints using ViewType = Kokkos::View; @@ -52,6 +52,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort without comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last); @@ -82,7 +83,7 @@ void sort(const Kokkos::View& view) { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view, const ComparatorType& comparator) { // constraints @@ -105,6 +106,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort with comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last, comparator); diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp 
b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp index 50ac82331957..2fe58272d920 100644 --- a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp @@ -18,7 +18,6 @@ #define KOKKOS_NESTED_SORT_IMPL_HPP_ #include -#include namespace Kokkos { namespace Experimental { @@ -99,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void sort_nested_impl( keyView(elem1) = key2; keyView(elem2) = key1; if constexpr (!std::is_same_v) { - Kokkos::Experimental::swap(valueView(elem1), valueView(elem2)); + Kokkos::kokkos_swap(valueView(elem1), valueView(elem2)); } } } diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp new file mode 100644 index 000000000000..36deccdfb1e2 --- /dev/null +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -0,0 +1,401 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ +#define KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ + +#include + +#if defined(KOKKOS_ENABLE_CUDA) + +// Workaround for `Instruction 'shfl' without '.sync' is not supported on +// .target sm_70 and higher from PTX ISA version 6.4`. +// Also see https://github.com/NVIDIA/cub/pull/170. 
+#if !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" + +#if defined(KOKKOS_COMPILER_CLANG) +// Some versions of Clang fail to compile Thrust, failing with errors like +// this: +// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: +// error: use of undeclared identifier 'va_printf' +// The exact combination of versions for Clang and Thrust (or CUDA) for this +// failure was not investigated, however even very recent version combination +// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. +// +// Defining _CubLog here locally allows us to avoid that code path, however +// disabling some debugging diagnostics +#pragma push_macro("_CubLog") +#ifdef _CubLog +#undef _CubLog +#endif +#define _CubLog +#include +#include +#pragma pop_macro("_CubLog") +#else +#include +#include +#endif + +#pragma GCC diagnostic pop + +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) && \ + (ONEDPL_VERSION_MAJOR > 2022 || \ + (ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2)) +#define KOKKOS_ONEDPL_HAS_SORT_BY_KEY +#include +#include +#endif + +namespace Kokkos::Impl { + +template +constexpr inline bool is_admissible_to_kokkos_sort_by_key = + ::Kokkos::is_view::value&& T::rank() == 1 && + (std::is_same::value || + std::is_same::value || + std::is_same::value); + +template +KOKKOS_INLINE_FUNCTION constexpr void +static_assert_is_admissible_to_kokkos_sort_by_key(const ViewType& /* view */) { + static_assert(is_admissible_to_kokkos_sort_by_key, + "Kokkos::sort_by_key only accepts 1D values View with " + "LayoutRight, LayoutLeft or LayoutStride."); +} + +// For the fallback implementation for sort_by_key using Kokkos::sort, we need +// to consider if Kokkos::sort defers to the fallback implementation that copies +// the array to the host and uses std::sort, see +// copy_to_host_run_stdsort_copy_back() in 
impl/Kokkos_SortImpl.hpp. If +// sort_on_device_v is true, we assume that std::sort doesn't copy data. +// Otherwise, we manually copy all data to the host and provide Kokkos::sort +// with a host execution space. +template +inline constexpr bool sort_on_device_v = false; + +#if defined(KOKKOS_ENABLE_CUDA) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_cudathrust( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::cuda::par.on(exec.cuda_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_rocthrust( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::hip::par.on(exec.hip_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +inline constexpr bool sort_on_device_v = + std::is_same_v || + std::is_same_v; + +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY +template +void sort_by_key_onedpl( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... 
maybeComparator) { + if (keys.stride(0) != 1 && values.stride(0) != 1) { + Kokkos::abort( + "SYCL sort_by_key only supports rank-1 Views with stride(0) = 1."); + } + + // Can't use Experimental::begin/end here since the oneDPL then assumes that + // the data is on the host. + auto queue = exec.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + const int n = keys.extent(0); + oneapi::dpl::sort_by_key(policy, keys.data(), keys.data() + n, values.data(), + std::forward(maybeComparator)...); +} +#endif +#endif + +template +void applyPermutation(const ExecutionSpace& space, + const PermutationView& permutation, + const ViewType& view) { + static_assert(std::is_integral::value); + + auto view_copy = Kokkos::create_mirror( + Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, + Kokkos::WithoutInitializing), + view); + Kokkos::deep_copy(space, view_copy, view); + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::permute_" + view.label(), + Kokkos::RangePolicy(space, 0, view.extent(0)), + KOKKOS_LAMBDA(int i) { view(i) = view_copy(permutation(i)); }); +} + +template +void sort_by_key_via_sort( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... 
maybeComparator) { + static_assert(sizeof...(MaybeComparator) <= 1); + + auto const n = keys.size(); + + Kokkos::View permute( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "Kokkos::sort_by_key_via_sort::permute"), + n); + + // iota + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::iota", + Kokkos::RangePolicy(exec, 0, n), + KOKKOS_LAMBDA(int i) { permute(i) = i; }); + + using Layout = + typename Kokkos::View::array_layout; + if constexpr (!sort_on_device_v) { + auto host_keys = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + keys); + auto host_permute = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + permute); + Kokkos::deep_copy(exec, host_keys, keys); + Kokkos::deep_copy(exec, host_permute, permute); + + exec.fence("Kokkos::Impl::sort_by_key_via_sort: before host sort"); + Kokkos::DefaultHostExecutionSpace host_exec; + + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + host_exec, host_permute, + KOKKOS_LAMBDA(int i, int j) { return host_keys(i) < host_keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + host_exec, host_permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(host_keys(i), host_keys(j)); + }); + } + host_exec.fence("Kokkos::Impl::sort_by_key_via_sort: after host sort"); + Kokkos::deep_copy(exec, permute, host_permute); + } else { +#ifdef KOKKOS_ENABLE_SYCL + auto* raw_keys_in_comparator = keys.data(); + auto stride = keys.stride(0); + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return raw_keys_in_comparator[i * stride] < + raw_keys_in_comparator[j * stride]; + }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(raw_keys_in_comparator[i * stride], + 
raw_keys_in_comparator[j * stride]); + }); + } +#else + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, + KOKKOS_LAMBDA(int i, int j) { return keys(i) < keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(keys(i), keys(j)); + }); + } +#endif + } + + applyPermutation(exec, permute, keys); + applyPermutation(exec, permute, values); +} + +// ------------------------------------------------------ +// +// specialize cases for sorting by key without comparator +// +// ------------------------------------------------------ + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_cudathrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_rocthrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values); + else +#endif + sort_by_key_via_sort(exec, keys, values); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_without_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_via_sort(exec, keys, values); +} + +// --------------------------------------------------- +// +// specialize cases for sorting by key with comparator +// +// --------------------------------------------------- + +#if 
defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_cudathrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_rocthrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values, comparator); + else +#endif + sort_by_key_via_sort(exec, keys, values, comparator); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_with_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_via_sort(exec, keys, values, comparator); +} + +#undef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + +} // namespace Kokkos::Impl +#endif diff --git a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index d87ab09e7724..4c174b5fda94 100644 --- a/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/packages/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -63,6 +63,11 @@ #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) #include #include @@ -184,6 +189,26 @@ void sort_cudathrust(const Cuda& space, } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_rocthrust(const 
HIP& space, + const Kokkos::View& view, + MaybeComparator&&... maybeComparator) { + using ViewType = Kokkos::View; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + const auto exec = thrust::hip::par.on(space.hip_stream()); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + thrust::sort(exec, first, last, + std::forward(maybeComparator)...); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_onedpl(const Kokkos::Experimental::SYCL& space, @@ -274,6 +299,14 @@ void sort_device_view_without_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_without_comparator( + const HIP& exec, const Kokkos::View& view) { + sort_rocthrust(exec, view); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( @@ -320,6 +353,15 @@ void sort_device_view_with_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_with_comparator( + const HIP& exec, const Kokkos::View& view, + const ComparatorType& comparator) { + sort_rocthrust(exec, view, comparator); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp index b7ce1ba5edb3..c5406c72b0d8 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Copy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ 
template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp index 8f9e0f19b808..82071a9362eb 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp 
b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp index ba18bc76b936..599fde5737ae 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp @@ -54,7 +54,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -69,7 +70,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -96,7 +98,7 @@ template & source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp index 43c91204837e..637d8d4cbc51 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_CopyN.hpp @@ -51,7 +51,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ 
template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp index a72a49cc22b8..593c42f87e13 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Equal.hpp @@ -80,7 +80,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -96,7 +96,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -111,7 +111,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -128,7 +128,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -227,7 +227,7 @@ template & view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -243,7 +243,7 @@ template & view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp index a796a306dda0..5bb2d1039dc6 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp @@ -19,7 +19,6 @@ #include #include "impl/Kokkos_Constraints.hpp" -#include "Kokkos_Swap.hpp" namespace Kokkos { namespace Experimental { @@ -33,7 +32,7 @@ struct StdIterSwapFunctor { KOKKOS_FUNCTION void operator()(int i) const { (void)i; - ::Kokkos::Experimental::swap(*m_a, *m_b); + ::Kokkos::kokkos_swap(*m_a, *m_b); } KOKKOS_FUNCTION @@ -58,6 +57,16 @@ void iter_swap(IteratorType1 a, IteratorType2 b) { Impl::iter_swap_impl(a, b); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::kokkos_swap instead!") +KOKKOS_FUNCTION + void swap(T& a, T& b) noexcept(::Kokkos::kokkos_swap(std::declval(), + std::declval())) { + ::Kokkos::kokkos_swap(a, b); +} 
+#endif + } // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp index 4b5c69df4512..e13479c370b8 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp @@ -54,7 +54,7 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -71,7 +71,7 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -112,7 +112,8 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -129,7 +130,8 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -161,7 +163,7 @@ template & view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -187,7 +189,8 @@ template & view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp index f04ea12ba88a..ac308ea1845c 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Move.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp index 375474ca57f9..2789ab217968 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp @@ -41,7 +41,7 @@ template < 
std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp index 37336c983ab0..66f39c4eaa60 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp index 39f33b64879a..d66763d304c4 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp @@ -40,7 +40,7 @@ template , int> = 0> auto swap_ranges(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template , int> = 0> auto swap_ranges(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp index 838c9169e25c..84cbed524d37 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Transform.hpp @@ -58,7 +58,7 @@ template , int> = 0> auto transform(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, 
UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -73,7 +73,7 @@ template , int> = 0> auto transform(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -119,7 +119,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -137,7 +137,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -174,7 +174,8 @@ template & source, - ::Kokkos::View& dest, UnaryOperation unary_op) { + const ::Kokkos::View& dest, + UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -207,7 +208,7 @@ KOKKOS_FUNCTION auto transform( const TeamHandleType& teamHandle, const ::Kokkos::View& source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp index 8151ee349552..5a7fe16984a2 100644 --- 
a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp @@ -47,8 +47,9 @@ struct ExclusiveScanDefaultFunctorForKnownNeutralElement { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = m_first_from[i]; if (final_pass) m_first_dest[i] = update + m_init_value; - update += m_first_from[i]; + update += tmp; } }; @@ -73,6 +74,7 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_first_from[i], false}; if (final_pass) { if (i == 0) { m_first_dest[i] = m_init_value; @@ -81,7 +83,6 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { } } - const auto tmp = value_type{m_first_from[i], false}; this->join(update, tmp); } @@ -132,6 +133,7 @@ struct TransformExclusiveScanFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -142,7 +144,6 @@ struct TransformExclusiveScanFunctorWithValueWrapper { } } - const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; this->join(update, tmp); } @@ -190,6 +191,7 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -200,7 +202,6 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { } } - const auto tmp = ValueType{m_unary_op(m_first_from[i])}; this->join(update, tmp); } diff --git 
a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp index 50224c8874ed..456df43aed21 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp @@ -46,15 +46,14 @@ struct StdRemoveIfStage1Functor { void operator()(const IndexType i, IndexType& update, const bool final_pass) const { auto& myval = m_first_from[i]; - if (final_pass) { - if (!m_must_remove(myval)) { + + if (!m_must_remove(myval)) { + if (final_pass) { // calling move here is ok because we are inside final pass // we are calling move assign as specified by the std m_first_dest[update] = std::move(myval); } - } - if (!m_must_remove(myval)) { update += 1; } } @@ -108,7 +107,9 @@ IteratorType remove_if_exespace_impl(const std::string& label, // create helper tmp view using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_remove_if_tmp_view", keep_count); + tmp_view_type tmp_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, ex, + "std_remove_if_tmp_view"), + keep_count); using tmp_readwrite_iterator_type = decltype(begin(tmp_view)); // in stage 1, *move* all elements to keep from original range to tmp diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index 428dc0d744a4..b4046c7645bd 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -39,7 +38,7 @@ struct StdReverseFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - 
::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]); + ::Kokkos::kokkos_swap(m_first[i], m_last[-i - 1]); } KOKKOS_FUNCTION diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp index 50bc7c8d610a..94147485071a 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp @@ -126,10 +126,11 @@ KOKKOS_FUNCTION IteratorType shift_left_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first + n, last); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { first[i] = std::move(first[i + n]); } }); diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp index cac20bfbba6a..0414e6f1c251 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp @@ -103,26 +103,6 @@ IteratorType shift_right_exespace_impl( return first + n; } -template -struct StdShiftRightTeamSingleFunctor { - Iterator m_first; - Iterator m_last; - std::size_t m_shift; - - KOKKOS_FUNCTION - void operator()() const { - // the impl function calling this functor guarantees that - // - m_shift is non-negative - // - m_first, m_last identify a valid range with m_last > m_first - // - m_shift is less than m_last - m_first - // so I can safely use std::size_t here - } - - KOKKOS_FUNCTION - StdShiftRightTeamSingleFunctor(Iterator 
_first, Iterator _last, std::size_t n) - : m_first(std::move(_first)), m_last(std::move(_last)), m_shift(n) {} -}; - template KOKKOS_FUNCTION IteratorType shift_right_team_impl( const TeamHandleType& teamHandle, IteratorType first, IteratorType last, @@ -145,10 +125,11 @@ KOKKOS_FUNCTION IteratorType shift_right_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first, last - n); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { last[-i - 1] = std::move(last[-n - i - 1]); } }); diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp index 5bc77ed7ddcd..930a14ac48c3 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -36,7 +35,7 @@ struct StdSwapRangesFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]); + ::Kokkos::kokkos_swap(m_first1[i], m_first2[i]); } KOKKOS_FUNCTION diff --git a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp index 11afa8ed6e08..286358245857 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp +++ b/packages/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp @@ -105,7 +105,9 @@ IteratorType unique_exespace_impl(const 
std::string& label, // using the same algorithm used for unique_copy but we now move things using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore); + tmp_view_type tmp_view(Kokkos::view_alloc(ex, Kokkos::WithoutInitializing, + "std_unique_tmp_view"), + num_elements_to_explore); // scan extent is: num_elements_to_explore - 1 // for same reason as the one explained in unique_copy diff --git a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt index 419f5ec1d132..db184bc8a999 100644 --- a/packages/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/packages/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -25,6 +25,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) set(ALGO_SORT_SOURCES) foreach(SOURCE_Input TestSort + TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB @@ -57,35 +58,37 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() + endif() +endforeach() - # ------------------------------------------ - # std set A - # ------------------------------------------ - set(STDALGO_SOURCES_A) - foreach(Name +# ------------------------------------------ +# std set A +# ------------------------------------------ +set(STDALGO_SOURCES_A) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator - ) - list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set B - # ------------------------------------------ - set(STDALGO_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std set B +# ------------------------------------------ +set(STDALGO_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps - ) 
- list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set C - # ------------------------------------------ - set(STDALGO_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std set C +# ------------------------------------------ +set(STDALGO_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsLexicographicalCompare StdAlgorithmsForEach @@ -100,15 +103,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsSearch_n StdAlgorithmsMismatch StdAlgorithmsMoveBackward - ) - list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set D - # ------------------------------------------ - set(STDALGO_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std set D +# ------------------------------------------ +set(STDALGO_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsModOps StdAlgorithmsModSeqOps @@ -128,15 +131,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsReverse StdAlgorithmsShiftLeft StdAlgorithmsShiftRight - ) - list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set E - # ------------------------------------------ - set(STDALGO_SOURCES_E) - foreach(Name +# ------------------------------------------ +# std set E +# ------------------------------------------ +set(STDALGO_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsIsSorted StdAlgorithmsIsSortedUntil @@ -149,83 +152,83 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTransformUnaryOp StdAlgorithmsTransformExclusiveScan StdAlgorithmsTransformInclusiveScan - ) - 
list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team Q - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_Q) - foreach(Name +# ------------------------------------------ +# std team Q +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_Q) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team P - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_P) - foreach(Name +# ------------------------------------------ +# std team P +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_P) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team M - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_M) - foreach(Name +# ------------------------------------------ +# std team M +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_M) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp StdAlgorithmsTeamTransformBinaryOp StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges - ) - list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team L - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_L) - foreach(Name +# 
------------------------------------------ +# std team L +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_L) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint - ) - list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team I - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_I) - foreach(Name +# ------------------------------------------ +# std team I +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_I) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce StdAlgorithmsTeamTransformReduce - ) - list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team H - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_H) - foreach(Name +# ------------------------------------------ +# std team H +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_H) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamCopy StdAlgorithmsTeamCopy_n @@ -236,43 +239,43 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamRemoveIf StdAlgorithmsTeamRemoveCopy StdAlgorithmsTeamRemoveCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team G - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_G) - foreach(Name +# ------------------------------------------ +# std team G +# ------------------------------------------ 
+set(STDALGO_TEAM_SOURCES_G) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft StdAlgorithmsTeamShiftRight - ) - list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team F - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_F) - foreach(Name +# ------------------------------------------ +# std team F +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_F) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate StdAlgorithmsTeamRotateCopy - ) - list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team E - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_E) - foreach(Name +# ------------------------------------------ +# std team E +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFill StdAlgorithmsTeamFill_n @@ -280,28 +283,28 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamReplaceIf StdAlgorithmsTeamReplaceCopy StdAlgorithmsTeamReplaceCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team D - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std team D +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement - ) - 
list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team C - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std team C +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFind StdAlgorithmsTeamFindIf @@ -310,29 +313,29 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamAnyOf StdAlgorithmsTeamNoneOf StdAlgorithmsTeamSearchN - ) - list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team B - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std team B +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd StdAlgorithmsTeamFindFirstOf - ) - list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team A - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_A) - foreach(Name +# ------------------------------------------ +# std team A +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_A) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamAdjacentFind StdAlgorithmsTeamCount @@ -341,11 +344,8 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamForEachN StdAlgorithmsTeamLexicographicalCompare StdAlgorithmsTeamMismatch - ) - list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) - 
endforeach() - - endif() + ) + list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. diff --git a/packages/kokkos/algorithms/unit_tests/Makefile b/packages/kokkos/algorithms/unit_tests/Makefile index 601217799a88..d3946c149baf 100644 --- a/packages/kokkos/algorithms/unit_tests/Makefile +++ b/packages/kokkos/algorithms/unit_tests/Makefile @@ -27,13 +27,13 @@ TARGETS = tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " > Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ ) \ ) diff --git a/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp b/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp new file mode 100644 index 000000000000..16f68eaaf267 --- /dev/null +++ b/packages/kokkos/algorithms/unit_tests/TestSortByKey.hpp @@ -0,0 +1,241 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP + +#include +#include +#include +#include + +#include // pair + +namespace Test { +namespace SortImpl { + +struct Less { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs < rhs; + } +}; + +struct Greater { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs > rhs; + } +}; + +template +struct is_sorted_by_key_struct { + Keys keys; + Keys keys_orig; + Permute permute; + Comparator comparator; + + is_sorted_by_key_struct(Keys keys_, Keys keys_orig_, Permute permute_, + Comparator comparator_ = Comparator{}) + : keys(keys_), + keys_orig(keys_orig_), + permute(permute_), + comparator(comparator_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, unsigned int &count) const { + if (i < keys.extent_int(0) - 1 && comparator(keys(i + 1), keys(i))) ++count; + if (keys(i) != keys_orig(permute(i))) ++count; + } +}; + +template +void iota(ExecutionSpace const &space, ViewType const &v, + typename ViewType::value_type value = 0) { + using ValueType = typename ViewType::value_type; + Kokkos::parallel_for( + "ArborX::Algorithms::iota", + Kokkos::RangePolicy(space, 0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = value + (ValueType)i; }); +} + +} // namespace SortImpl + +TEST(TEST_CATEGORY, SortByKeyEmptyView) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 0); + Kokkos::View values("values", 0); + + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); +} + +TEST(TEST_CATEGORY, SortByKey) { + using ExecutionSpace 
= TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + for (auto keys_vector : {std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct(keys, keys_orig, + permute), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyWithComparator) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + SortImpl::Greater comparator; + + for (auto keys_vector : {std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute, comparator); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys, keys_orig, permute, comparator), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, 
SortByKeyStaticExtents) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + Kokkos::View keys("keys"); + + Kokkos::View values_static("values_static"); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_static)); + + Kokkos::View values_dynamic("values_dynamic", 10); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); +} + +template +void buildViewsForStrided(ExecutionSpace const &space, int n, Keys &keys, + Values &values) { + Kokkos::parallel_for( + "create_data", + Kokkos::MDRangePolicy, ExecutionSpace>(space, {0, 0, 0}, + {n, n, n}), + KOKKOS_LAMBDA(int i, int j, int k) { + keys(i, j, k) = n - i; + values(i, j, k) = j; + }); +} + +TEST(TEST_CATEGORY, SortByKeyWithStrides) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + auto const n = 10; + + Kokkos::View keys("keys", n, n, n); + Kokkos::View values("values", n, n, n); + buildViewsForStrided(space, n, keys, values); + + auto keys_sub = Kokkos::subview(keys, Kokkos::ALL(), 1, 2); + auto values_sub = Kokkos::subview(values, 4, Kokkos::ALL(), 6); + + auto keys_orig = Kokkos::create_mirror(space, keys_sub); + Kokkos::deep_copy(space, keys_orig, keys_sub); + + Kokkos::Experimental::sort_by_key(space, keys_sub, values_sub); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys_sub, keys_orig, values_sub), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); +} + +TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 3); + Kokkos::View values("values", 1); + + ASSERT_DEATH( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values), + "values and keys extents must be the same"); + ASSERT_DEATH(Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values, + SortImpl::Greater{}), + "values and keys 
extents must be the same"); +} + +} // namespace Test +#endif diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 3eb963faf2d2..67052e2f9d4d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -239,16 +239,8 @@ KOKKOS_FUNCTION bool team_members_have_matching_result( // set accum to 1 if a mismach is found const bool mismatch = memberValue != target; int accum = static_cast(mismatch); - // FIXME_OPENMPTARGET: team API does not meet the TeamHandle concept and - // ignores the reducer passed -#if defined KOKKOS_ENABLE_OPENMPTARGET - Kokkos::Sum dummyReducer(accum); - const auto result = teamHandle.team_reduce(accum, dummyReducer); - return (result == 0); -#else teamHandle.team_reduce(Kokkos::Sum(accum)); return (accum == 0); -#endif } template diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 6ab68a1987df..b364c53a8882 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -132,47 +133,6 @@ void my_host_exclusive_scan(it1 first, it1 last, it2 dest, ValType init, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - ValueType init_value, BinaryOp bop) { - //! 
always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), init_value, bop); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - if (test_view_h.extent(0) > 0) { - for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -189,107 +149,153 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value, BinaryOp bop) { + //! 
always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), init_value, bop); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + if (test_view_h.extent(0) > 0) { + for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value) { + (*this)(data_view, test_view, init_value, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info, - ValueType init_value) { - using default_op = SumFunctor; +template +void run_single_scenario(const InfoType& scenario_info, ValueType init_value, + OpOrEmpty... 
empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm everytime to + // ensure the algorithm does something meaningful { fill_zero(view_dest); auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value); + auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value, + empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } Kokkos::fence(); } -template 
-void run_single_scenario_custom_op(const InfoType& scenario_info, - ValueType init_value, BinaryOp bop) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, + OpOrEmpty... empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); - fill_view(view_from, name); + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op Therefore, after the op is done, view2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place + + auto view1 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view1"); + fill_view(view1, name); + auto view2 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, 
KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = - KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } Kokkos::fence(); @@ -303,34 +309,39 @@ void run_exclusive_scan_all_scenarios() { {"medium", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it, ValueType{0}); - run_single_scenario_default_op(it, ValueType{1}); - run_single_scenario_default_op(it, ValueType{-2}); - run_single_scenario_default_op(it, ValueType{3}); + run_single_scenario(it, ValueType{0}); + run_single_scenario(it, ValueType{1}); + run_single_scenario(it, ValueType{-2}); + run_single_scenario(it, ValueType{3}); + + run_single_scenario_inplace(it, ValueType{0}); + run_single_scenario_inplace(it, ValueType{-2}); #if !defined KOKKOS_ENABLE_OPENMPTARGET // custom multiply op is only run for small views otherwise it overflows if (it.first == "small-a" || it.first == "small-b") { using custom_bop_t = MultiplyFunctor; - 
run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, - custom_bop_t()); - } + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); - using custom_bop_t = SumFunctor; - run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, + run_single_scenario_inplace(it, ValueType{0}, custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, + run_single_scenario_inplace(it, ValueType{-2}, custom_bop_t()); + } + + using custom_bop_t = SumFunctor; + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); + + run_single_scenario_inplace(it, ValueType{0}, + custom_bop_t()); + run_single_scenario_inplace(it, ValueType{-2}, + custom_bop_t()); #endif } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index 8e60a43e5ffb..a08a73721088 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -143,51 +144,6 @@ void my_host_inclusive_scan(it1 first, it1 last, it2 dest, BinOp bop, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - BinaryOp bop, 
Args... args /* copy on purpose */) { - //! always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), bop, args...); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - - const auto ext = test_view_h.extent(0); - if (ext > 0) { - for (std::size_t i = 0; i < ext; ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - // std::cout << " last el: " << test_view_h(ext-1) << std::endl; - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -204,107 +160,151 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + BinaryOp bop, Args... args /* copy on purpose */) { + //! 
always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), bop, args...); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + + const auto ext = test_view_h.extent(0); + if (ext > 0) { + for (std::size_t i = 0; i < ext; ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view) // the view to test + { + using value_type = typename ViewType1::non_const_value_type; + (*this)(data_view, test_view, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info) { - using default_op = SumFunctor; +template +void run_single_scenario(const InfoType& scenario_info, + Args... 
args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "inclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << std::endl; auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm everytime to + // ensure the algorithm does something meaningful { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan(exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest); + auto r = KE::inclusive_scan(exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest); + auto r = + KE::inclusive_scan("label", exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } 
Kokkos::fence(); } -template -void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop, - Args... args /* copy on purpose */) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + Args... args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // if (1 == sizeof...(Args)) { - // std::cout << "inclusive_scan custom op and init value: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } else { - // std::cout << "inclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op Therefore, after the op is done, view_2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place - auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); - fill_view(view_from, name); + auto view1 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view1"); + fill_view(view1, name); + + auto view2 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - 
fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } Kokkos::fence(); @@ -318,27 +318,35 @@ void run_inclusive_scan_all_scenarios() { {"medium-a", 313}, {"medium-b", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it); + run_single_scenario(it); + run_single_scenario_inplace(it); #if !defined KOKKOS_ENABLE_OPENMPTARGET // the sum custom op is always run using sum_binary_op = SumFunctor; sum_binary_op sbop; - run_single_scenario_custom_op(it, sbop); - run_single_scenario_custom_op(it, sbop, ValueType{0}); - run_single_scenario_custom_op(it, sbop, ValueType{1}); - run_single_scenario_custom_op(it, sbop, ValueType{-2}); - run_single_scenario_custom_op(it, sbop, ValueType{3}); + run_single_scenario(it, sbop); + run_single_scenario(it, sbop, ValueType{0}); + run_single_scenario(it, sbop, ValueType{1}); + 
run_single_scenario(it, sbop, ValueType{-2}); + run_single_scenario(it, sbop, ValueType{3}); + + run_single_scenario_inplace(it, sbop, ValueType{0}); + run_single_scenario_inplace(it, sbop, ValueType{-2}); // custom multiply only for small views to avoid overflows if (it.first == "small-a" || it.first == "small-b") { using mult_binary_op = MultiplyFunctor; mult_binary_op mbop; - run_single_scenario_custom_op(it, mbop); - run_single_scenario_custom_op(it, mbop, ValueType{0}); - run_single_scenario_custom_op(it, mbop, ValueType{1}); - run_single_scenario_custom_op(it, mbop, ValueType{-2}); - run_single_scenario_custom_op(it, mbop, ValueType{3}); + run_single_scenario(it, mbop); + run_single_scenario(it, mbop, ValueType{0}); + run_single_scenario(it, mbop, ValueType{1}); + run_single_scenario(it, mbop, ValueType{-2}); + run_single_scenario(it, mbop, ValueType{3}); + + run_single_scenario_inplace(it, mbop); + run_single_scenario_inplace(it, mbop, ValueType{0}); + run_single_scenario_inplace(it, mbop, ValueType{-2}); } #endif } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index f31d49e06b4a..75d4f0afebce 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -146,7 +146,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsA[3] = KE::is_sorted("label", exespace(), view); const auto allA = std::all_of(resultsA.cbegin(), resultsA.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allA); + EXPECT_TRUE(allA) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ -159,7 +159,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), [=](bool v) { return 
v == gold; }); - EXPECT_TRUE(allB); + EXPECT_TRUE(allB) << name << ", " << view_tag_to_string(Tag{}); #endif Kokkos::fence(); @@ -173,9 +173,6 @@ void run_is_sorted_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index dcfe8ad67e11..29ac7cc9bc12 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -145,10 +145,10 @@ void run_single_scenario(const InfoType& scenario_info) { KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view)); auto r3 = KE::is_sorted_until(exespace(), view); auto r4 = KE::is_sorted_until("label", exespace(), view); - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ -160,10 +160,10 @@ void run_single_scenario(const InfoType& scenario_info) { auto r8 = KE::is_sorted_until("label", exespace(), view, comp); #endif - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); 
Kokkos::fence(); } @@ -176,9 +176,6 @@ void run_is_sorted_until_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted_until: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 4604764097eb..1b1a02f39c4f 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); // move constr MyMovableType b(std::move(a)); @@ -70,7 +70,7 @@ struct StdAlgoModSeqOpsTestMove { void operator()(const int index) const { typename ViewType::value_type a{11}; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); m_view(index) = std::move(a); } @@ -89,50 +89,6 @@ TEST(std_algorithms_mod_ops_test, move_within_parfor) { } } -// ------------ -// swap -// ------------ -TEST(std_algorithms_mod_ops_test, swap) { - { - int a = 1; - int b = 2; - KE::swap(a, b); - ASSERT_EQ(a, 2); - ASSERT_EQ(b, 1); - } - - { - double a = 3.; - double b = 1.; - KE::swap(a, b); - EXPECT_DOUBLE_EQ(a, 1.); - EXPECT_DOUBLE_EQ(b, 3.); - } -} - -template -struct StdAlgoModSeqOpsTestSwap { - ViewType m_view; - - KOKKOS_INLINE_FUNCTION - void operator()(const int index) const { - typename ViewType::value_type newval{11}; - KE::swap(m_view(index), newval); - } - - StdAlgoModSeqOpsTestSwap(ViewType aIn) : m_view(aIn) {} -}; - -TEST(std_algorithms_mod_ops_test, swap_within_parfor) { - auto a = 
create_view(stdalgos::DynamicTag{}, 10, "a"); - StdAlgoModSeqOpsTestSwap fnc(a); - Kokkos::parallel_for(a.extent(0), fnc); - auto a_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a); - for (std::size_t i = 0; i < a.extent(0); ++i) { - EXPECT_DOUBLE_EQ(a_h(0), 11.); - } -} - // ------------ // iter_swap // ------------ diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index f169fd9ce881..a36c9db2b9eb 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -110,11 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); const std::size_t ext = view_from.extent(0); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index b5aa27c7c387..7c3c465dc8d0 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -166,6 +166,10 @@ void run_all_scenarios() { } TEST(std_algorithms_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index c6b2566c6cfb..2c8fee02f473 100644 --- 
a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -121,7 +121,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -147,9 +149,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -168,12 +167,19 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -223,11 +229,16 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef exclusive_scan } - auto dataViewAfterOp_h = 
create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { @@ -236,16 +247,24 @@ void run_all_scenarios() { #else for (int apiId : {0, 1}) { #endif - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } } TEST(std_algorithms_exclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamExclusiveScan diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp index 0daf9dbfe824..b5f4cdd6123f 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp @@ -139,7 +139,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -165,9 +167,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // 
inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -186,12 +185,20 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -251,25 +258,38 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef inclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1, 2, 3, 4, 5}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } 
} TEST(std_algorithms_inclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamInclusiveScan diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 24b840154b73..6bb0d249988d 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -212,6 +212,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index ce18eb4d3198..cff9aa178a29 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -168,6 +168,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp 
b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp index 9f30812d8ef0..60fa369af180 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp @@ -108,7 +108,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -134,9 +136,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // tranform_exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -156,12 +155,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // 
----------------------------------------------- // run cpp-std kernel and check @@ -200,16 +208,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef transform_exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -219,6 +232,10 @@ TEST(std_algorithms_transform_exclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformExclusiveScan diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 4b3166023267..10454d65515b 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -131,7 +131,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -157,9 +159,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View 
destView("destView", numTeams, numCols); - // tranform_inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -179,12 +178,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -236,16 +244,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { } #undef transform_inclusive_scan - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 
1, 2, 3}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -255,6 +268,10 @@ TEST(std_algorithms_transform_inclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformInclusiveScan diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 87687b60a16e..0d3289e196f0 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -186,6 +186,10 @@ void run_all_scenarios() { } TEST(std_algorithms_unique_copy_team_test, test) { + // FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index 9dac3ce75ffa..fa2804256ac2 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -160,24 +161,15 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { 
ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -205,17 +197,13 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, BinaryOp bop, UnaryOp uop) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "transform_exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - - auto view_dest = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); - auto view_from = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); + + auto view_from = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_from"); fill_view(view_from, name); + auto view_dest = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_dest"); { fill_zero(view_dest); auto r = KE::transform_exclusive_scan( @@ -253,6 +241,65 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, Kokkos::fence(); } +template +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, BinaryOp bop, + UnaryOp uop) { + const auto name = std::get<0>(scenario_info); + const std::size_t view_ext = std::get<1>(scenario_info); + + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the 
scenario asks for and used for the + // in-place op Therefore, after the op is done, view2 should contain the + // result of doing exclusive scan NOTE: view2 is filled below every time + // because the algorithm acts in place + + auto view1 = + create_view(Tag{}, view_ext, "transform_exclusive_scan_view1"); + fill_view(view1, name); + + auto view2 = + create_view(Tag{}, view_ext, "transform_exclusive_scan_view2"); + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan(exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), + init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan( + "label", exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan(exespace(), view2, view2, init_value, + bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + { + fill_view(view2, name); + auto r = KE::transform_exclusive_scan("label", exespace(), view2, view2, + init_value, bop, uop); + ASSERT_EQ(r, KE::end(view2)); + verify_data(view1, view2, init_value, bop, uop); + } + + Kokkos::fence(); +} + template void run_all_scenarios() { const std::map scenarios = { @@ -267,6 +314,11 @@ void run_all_scenarios() { run_single_scenario(it, ValueType{1}, bop_t(), uop_t()); run_single_scenario(it, ValueType{-2}, bop_t(), uop_t()); run_single_scenario(it, ValueType{3}, bop_t(), uop_t()); + + run_single_scenario_inplace(it, ValueType{0}, bop_t(), + uop_t()); + run_single_scenario_inplace(it, ValueType{-2}, bop_t(), + uop_t()); } } diff --git a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp 
index a90a68ca1d75..fb81ae91b049 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -172,24 +173,15 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -210,30 +202,11 @@ struct SumBinaryFunctor { std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void print_scenario_details(const std::string& name, BopT bop, UopT uop) { - (void)bop; - (void)uop; - std::cout << "transform_inclusive_scan: " << name << ", " - << view_tag_to_string(Tag{}) << std::endl; -} - -template -void print_scenario_details(const std::string& name, BopT bop, UopT uop, - ValueType init_value) { - (void)bop; - (void)uop; - std::cout << "transform_inclusive_scan: " << name << ", " - << view_tag_to_string(Tag{}) << ", " - << "init = " << init_value << std::endl; -} - 
template void run_single_scenario(const InfoType& scenario_info, Args... args /* by value on purpose*/) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // print_scenario_details(name, args...); auto view_dest = create_view(Tag{}, view_ext, "transform_inclusive_scan"); @@ -278,6 +251,63 @@ void run_single_scenario(const InfoType& scenario_info, Kokkos::fence(); } +template +void run_single_scenario_inplace(const InfoType& scenario_info, + Args... args /* by value on purpose*/) { + const auto name = std::get<0>(scenario_info); + const std::size_t view_ext = std::get<1>(scenario_info); + + // since here we call the in-place operation, we need to use two views: + // view1: filled according to scenario and is not modified + // view2: filled according scenario and used for the in-place op + // Therefore, after the op is done, view_2 should contain the + // result of doing exclusive scan. + // NOTE: view2 must be filled before every call to the algorithm + // because the algorithm acts in place + + auto view_1 = create_view(Tag{}, view_ext, + "transform_inclusive_scan_view_1"); + fill_view(view_1, name); + + auto view_2 = create_view(Tag{}, view_ext, + "transform_inclusive_scan_view_2"); + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_2), + KE::cend(view_2), KE::begin(view_2), + args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan("label", exespace(), + KE::cbegin(view_2), KE::cend(view_2), + KE::begin(view_2), args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = KE::transform_inclusive_scan(exespace(), view_2, view_2, args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + { + fill_view(view_2, name); + auto r = 
KE::transform_inclusive_scan("label", exespace(), view_2, view_2, + args...); + ASSERT_EQ(r, KE::end(view_2)); + verify_data(view_1, view_2, args...); + } + + Kokkos::fence(); +} + template void run_all_scenarios() { const std::map scenarios = { @@ -294,15 +324,23 @@ void run_all_scenarios() { run_single_scenario(it, bop_t(), uop_t(), ValueType{2}); run_single_scenario(it, bop_t(), uop_t(), ValueType{-1}); run_single_scenario(it, bop_t(), uop_t(), ValueType{-2}); + + run_single_scenario_inplace(it, bop_t(), uop_t()); + run_single_scenario_inplace(it, bop_t(), uop_t(), + ValueType{0}); + run_single_scenario_inplace(it, bop_t(), uop_t(), + ValueType{2}); + run_single_scenario_inplace(it, bop_t(), uop_t(), + ValueType{-2}); } } #if !defined KOKKOS_ENABLE_OPENMPTARGET TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) { run_all_scenarios(); - // run_all_scenarios(); - // run_all_scenarios(); - // run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } #endif diff --git a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp index 3847e1e6a366..c05006a1617c 100644 --- a/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp +++ b/packages/kokkos/algorithms/unit_tests/TestStdReducers.cpp @@ -83,9 +83,6 @@ auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = std::conditional_t< (flag == 0), Kokkos::MaxFirstLoc, @@ -132,18 +129,24 @@ TEST(std_algorithms_reducers, max_first_loc) { const auto pair1 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::LeftToRight); - ASSERT_EQ(pair1.first, gold_value); - ASSERT_EQ(pair1.second, gold_location); + ASSERT_EQ(pair1.first, 
gold_value) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); + ASSERT_EQ(pair1.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); const auto pair2 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::RightToLeft); - ASSERT_EQ(pair2.first, gold_value); - ASSERT_EQ(pair2.second, gold_location); + ASSERT_EQ(pair2.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); + ASSERT_EQ(pair2.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); const auto pair3 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::Random); - ASSERT_EQ(pair3.first, gold_value); - ASSERT_EQ(pair3.second, gold_location); + ASSERT_EQ(pair3.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::Random); + ASSERT_EQ(pair3.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::Random); } TEST(std_algorithms_reducers, min_first_loc) { @@ -191,9 +194,6 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = Kokkos::MinMaxFirstLastLoc; @@ -212,10 +212,10 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, reduction_value_type{view(index), view(index), index, index}); } - ASSERT_EQ(red_result.min_val, gold_values.first); - ASSERT_EQ(red_result.max_val, gold_values.second); - ASSERT_EQ(red_result.min_loc, gold_locs.first); - ASSERT_EQ(red_result.max_loc, gold_locs.second); + ASSERT_EQ(red_result.min_val, gold_values.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_val, gold_values.second) << order_to_string(enValue); + ASSERT_EQ(red_result.min_loc, gold_locs.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_loc, 
gold_locs.second) << order_to_string(enValue); } TEST(std_algorithms_reducers, min_max_first_last_loc) { diff --git a/packages/kokkos/benchmarks/CMakeLists.txt b/packages/kokkos/benchmarks/CMakeLists.txt index 42279bf55db8..abf502835947 100644 --- a/packages/kokkos/benchmarks/CMakeLists.txt +++ b/packages/kokkos/benchmarks/CMakeLists.txt @@ -1 +1,12 @@ +#FIXME_OPENMPTARGET - compiling in debug mode causes ICE. +KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) +KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) + +#FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. +IF(NOT Kokkos_ENABLE_OPENMPTARGET) + KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) + KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) +ENDIF() diff --git a/packages/kokkos/benchmarks/atomic/CMakeLists.txt b/packages/kokkos/benchmarks/atomic/CMakeLists.txt new file mode 100644 index 000000000000..85f7412f492f --- /dev/null +++ b/packages/kokkos/benchmarks/atomic/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + atomic + SOURCES main.cpp +) diff --git a/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt b/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt new file mode 100644 index 000000000000..0ce44a6f1a8e --- /dev/null +++ b/packages/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + bytes_and_flops + SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp +) diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp index 2589fd7309b2..88830af624b6 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench.hpp @@ -37,22 +37,22 @@ struct RunStride { }; #define STRIDE 1 -#include 
+#include "bench_stride.hpp" #undef STRIDE #define STRIDE 2 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 4 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 8 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 16 -#include +#include "bench_stride.hpp" #undef STRIDE #define STRIDE 32 -#include +#include "bench_stride.hpp" #undef STRIDE template diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp index f955c996660a..2fda1ae3d427 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_double.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp index 137ff67d4040..3210116a9ee7 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_float.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp index 29ccec014149..24a5dcd38997 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_int32_t.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp index c153d5eff397..0634700c31e1 100644 
--- a/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_int64_t.cpp @@ -14,7 +14,7 @@ // //@HEADER -#include +#include "bench.hpp" template void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S, int B, int I); diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp index b63d486fc9e4..80f017fbe8fc 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_stride.hpp @@ -15,28 +15,28 @@ //@HEADER #define UNROLL 1 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 2 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 3 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 4 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 5 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 6 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 7 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL #define UNROLL 8 -#include +#include "bench_unroll_stride.hpp" #undef UNROLL template diff --git a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index 0f7a298c1bb6..78cfd48effec 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -26,7 +26,7 @@ struct Run { Kokkos::deep_copy(C, Scalar(3.5)); Kokkos::Timer timer; - for (int i = 0; i < I; ++i) { + for (int iter = 0; iter < I; ++iter) { Kokkos::parallel_for( "BenchmarkKernel", Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)), diff --git a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp index 
20077757d1ff..fdfcc4ea64ff 100644 --- a/packages/kokkos/benchmarks/bytes_and_flops/main.cpp +++ b/packages/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include "bench.hpp" #include extern template void run_stride_unroll(int, int, int, int, int, int, int, @@ -86,7 +86,7 @@ int main(int argc, char* argv[]) { printf("D must be one of 1,2,4,8,16,32\n"); return 0; } - if ((P < 1) && (P > 2)) { + if ((P < 1) || (P > 4)) { printf("P must be one of 1,2,3,4\n"); return 0; } diff --git a/packages/kokkos/benchmarks/gather/CMakeLists.txt b/packages/kokkos/benchmarks/gather/CMakeLists.txt new file mode 100644 index 000000000000..24c706277259 --- /dev/null +++ b/packages/kokkos/benchmarks/gather/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + gather + SOURCES main.cpp +) diff --git a/packages/kokkos/benchmarks/gather/gather.hpp b/packages/kokkos/benchmarks/gather/gather.hpp index d83461702c78..90b1101c1d5e 100644 --- a/packages/kokkos/benchmarks/gather/gather.hpp +++ b/packages/kokkos/benchmarks/gather/gather.hpp @@ -20,28 +20,28 @@ struct RunGather { }; #define UNROLL 1 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 2 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 3 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 4 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 5 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 6 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 7 -#include +#include "gather_unroll.hpp" #undef UNROLL #define UNROLL 8 -#include +#include "gather_unroll.hpp" #undef UNROLL template diff --git a/packages/kokkos/benchmarks/gather/gather_unroll.hpp b/packages/kokkos/benchmarks/gather/gather_unroll.hpp index 5ee5742a3f72..1aa73091bc5a 100644 --- a/packages/kokkos/benchmarks/gather/gather_unroll.hpp +++ b/packages/kokkos/benchmarks/gather/gather_unroll.hpp @@ -138,7 +138,7 @@ struct 
RunGather { printf( "SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: " "%lf GGather/s: %lf\n", - sizeof(Scalar) / 4, N, K, D, R, UNROLL, F, seconds, + static_cast(sizeof(Scalar) / 4), N, K, D, R, UNROLL, F, seconds, 1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds, 1.e-9 * gather_ops / seconds); } diff --git a/packages/kokkos/benchmarks/gather/main.cpp b/packages/kokkos/benchmarks/gather/main.cpp index 7f4fc9ede6ce..07fca9fdc64d 100644 --- a/packages/kokkos/benchmarks/gather/main.cpp +++ b/packages/kokkos/benchmarks/gather/main.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include "gather.hpp" #include int main(int argc, char* argv[]) { diff --git a/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt b/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt new file mode 100644 index 000000000000..bb14da749d12 --- /dev/null +++ b/packages/kokkos/benchmarks/launch_latency/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + launch_latency + SOURCES launch_latency.cpp +) diff --git a/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp b/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp new file mode 100644 index 000000000000..73b176ab8dd7 --- /dev/null +++ b/packages/kokkos/benchmarks/launch_latency/launch_latency.cpp @@ -0,0 +1,283 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +/*! \file launch_latency.cpp + + Tests of parallel_for and parallel_reduce latency for different + circumstances. 
+ + Three launch kinds are tested: parallel_for, parallel_reduce into scalar, + and parallel_reduce into view + + N controls how large the parallel loops is + V controls how large the functor is + M controls across how many launches the latency is averaged + K controls how larege the nested loop is (no larger than V) + + For each launch kind, + 1. Avg functor dispatch latency: (time to do M launches) / M + 2. Avg functor completion throughput: (M launches + sync) / M + 3. Avg functor completion latency: (M (launch + sync)) / M +*/ + +#include + +template +struct TestFunctor { + double values[V]; + Kokkos::View a; + int K; + TestFunctor(Kokkos::View a_, int K_) : a(a_), K(K_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i) const { + for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j]; + } +}; + +template +struct TestRFunctor { + double values[V]; + Kokkos::View a; + int K; + TestRFunctor(Kokkos::View a_, int K_) : a(a_), K(K_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int i, double& lsum) const { + for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j]; + lsum += a(i); + } +}; + +struct Opts { + bool par_for = true; + bool par_reduce = true; + bool par_reduce_view = true; +}; + +template +void run(int N, int M, int K, const Opts& opts) { + std::string l_no_fence, l_fence, l_red_no_fence, l_red_fence, + l_red_view_no_fence, l_red_view_fence; + { + std::ostringstream ostream; + ostream << "RunNoFence_" << N << "_" << K << std::endl; + l_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunFence_" << N << "_" << K << std::endl; + l_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceNoFence_" << N << "_" << K << std::endl; + l_red_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceFence_" << N << "_" << K << std::endl; + l_red_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceViewNoFence_" << N << "_" << K << 
std::endl; + l_red_view_no_fence = ostream.str(); + } + { + std::ostringstream ostream; + ostream << "RunReduceViewFence_" << N << "_" << K << std::endl; + l_red_view_fence = ostream.str(); + } + + double result; + Kokkos::View a("A", N); + Kokkos::View v_result("result"); + TestFunctor f(a, K); + TestRFunctor rf(a, K); + Kokkos::Timer timer; + + // initialize to an obviously wrong value + double time_no_fence = -1; // launch loop + double time_no_fence_fenced = -1; // launch loop then fence + double time_fence = -1; // launch&fence loop + + double time_red_no_fence = -1; + double time_red_no_fence_fenced = -1; + double time_red_fence = -1; + + double time_red_view_no_fence = -1; + double time_red_view_no_fence_fenced = -1; + double time_red_view_fence = -1; + + if (opts.par_for) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_for(l_no_fence, N, f); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_for(l_no_fence, N, f); + } + time_no_fence = timer.seconds(); + Kokkos::fence(); + time_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_for(l_fence, N, f); + Kokkos::fence(); + } + time_fence = timer.seconds(); + } + + if (opts.par_reduce) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_reduce(l_red_no_fence, N, rf, result); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_no_fence, N, rf, result); + } + time_red_no_fence = timer.seconds(); + Kokkos::fence(); + time_red_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_fence, N, rf, result); + Kokkos::fence(); + } + time_red_fence = timer.seconds(); + Kokkos::fence(); + } + + if (opts.par_reduce_view) { + // warmup + for (int i = 0; i < 4; ++i) { + Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result); + } + Kokkos::fence(); + + timer.reset(); + for (int i = 
0; i < M; i++) { + Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result); + } + time_red_view_no_fence = timer.seconds(); + Kokkos::fence(); + time_red_view_no_fence_fenced = timer.seconds(); + + timer.reset(); + for (int i = 0; i < M; i++) { + Kokkos::parallel_reduce(l_red_view_fence, N, rf, v_result); + Kokkos::fence(); + } + time_red_view_fence = timer.seconds(); + Kokkos::fence(); + timer.reset(); + } + + const double x = 1.e6 / M; + printf("%i %i %i %i", N, V, K, M); + if (opts.par_for) { + printf(" parallel_for: %lf %lf ( %lf )", x * time_no_fence, x * time_fence, + x * time_no_fence_fenced); + } + if (opts.par_reduce) { + printf(" parallel_reduce: %lf %lf ( %lf )", x * time_red_no_fence, + x * time_red_fence, x * time_red_no_fence_fenced); + } + if (opts.par_reduce_view) { + printf(" parallel_reduce(view): %lf %lf ( %lf )", + x * time_red_view_no_fence, x * time_red_view_fence, + x * time_red_view_no_fence_fenced); + } + printf("\n"); +} +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + { + int N = 10000; + int M = 20; + int K = 1; + + Opts opts; + + printf("==========================\n"); + printf("Kokkos Launch Latency Test\n"); + printf("==========================\n"); + printf("\n"); + printf("Usage: %s ARGUMENTS [OPTIONS...]\n\n", argv[0]); + printf("Arguments: N M K\n"); + printf(" N: loop length\n"); + printf(" M: how many kernels to dispatch\n"); + printf( + " K: nested loop length (capped by size of functor member array\n\n"); + printf("Options:\n"); + printf(" --no-parallel-for: skip parallel_for benchmark\n"); + printf(" --no-parallel-reduce: skip parallel_reduce benchmark\n"); + printf( + " --no-parallel-reduce-view: skip parallel_reduce into view " + "benchmark\n"); + printf("\n\n"); + printf(" Output V is the size of the functor member array\n"); + printf("\n\n"); + + for (int i = 1; i < argc; ++i) { + const std::string_view arg(argv[i]); + + // anything that doesn't start with -- + if (arg.size() < 2 || + 
(arg.size() >= 2 && arg[0] != '-' && arg[1] != '-')) { + if (i == 1) + N = atoi(arg.data()); + else if (i == 2) + M = atoi(arg.data()); + else if (i == 3) + K = atoi(arg.data()); + else { + throw std::runtime_error("unexpected argument!"); + } + } else if (arg == "--no-parallel-for") { + opts.par_for = false; + } else if (arg == "--no-parallel-reduce") { + opts.par_reduce = false; + } else if (arg == "--no-parallel-reduce-view") { + opts.par_reduce_view = false; + } else { + std::stringstream ss; + ss << "unexpected argument \"" << arg << "\" at position " << i; + throw std::runtime_error(ss.str()); + } + } + + printf("N V K M time_no_fence time_fence (time_no_fence_fenced)\n"); + + /* A backend may have different launch strategies for functors of different + * sizes: test a variety of functor sizes.*/ + run<1>(N, M, K <= 1 ? K : 1, opts); + run<16>(N, M, K <= 16 ? K : 16, opts); + run<200>(N, M, K <= 200 ? K : 200, opts); + run<3000>(N, M, K <= 3000 ? K : 3000, opts); + run<30000>(N, M, K <= 30000 ? 
K : 30000, opts); + } + Kokkos::finalize(); +} diff --git a/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt b/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt new file mode 100644 index 000000000000..929b9c970237 --- /dev/null +++ b/packages/kokkos/benchmarks/policy_performance/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + policy_performance + SOURCES main.cpp +) diff --git a/packages/kokkos/benchmarks/policy_performance/main.cpp b/packages/kokkos/benchmarks/policy_performance/main.cpp index 28cfde552a59..0983a3d535c9 100644 --- a/packages/kokkos/benchmarks/policy_performance/main.cpp +++ b/packages/kokkos/benchmarks/policy_performance/main.cpp @@ -106,8 +106,9 @@ int main(int argc, char* argv[]) { Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10, 1), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, - double& lval) { lval += 1; }, + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type&, double& lval) { + lval += 1; + }, result); using view_type_1d = Kokkos::View; diff --git a/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp b/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp index cc2cc40257b8..0e23d221f671 100644 --- a/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp +++ b/packages/kokkos/benchmarks/policy_performance/policy_perf_test.hpp @@ -21,13 +21,13 @@ struct ParallelScanFunctor { using value_type = double; ViewType v; - ParallelScanFunctor(const ViewType& v_) : v(v_) {} + explicit ParallelScanFunctor(const ViewType& v_) : v(v_) {} KOKKOS_INLINE_FUNCTION - void operator()(const int idx, value_type& val, const bool& final) const { + void operator()(const int idx, value_type& val, const bool& is_final) const { // inclusive scan val += v(idx); - if (final) { + if (is_final) { v(idx) = val; } } @@ -109,7 +109,7 @@ void test_policy(int team_range, int thread_range, int vector_range, vector_result = 0.0; Kokkos::parallel_reduce( 
Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); } v2(idx, t) = vector_result; @@ -128,7 +128,7 @@ void test_policy(int team_range, int thread_range, int vector_range, team_result = 0.0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { lval += 1; }, team_result); + [&](const int, double& lval) { lval += 1; }, team_result); } v1(idx) = team_result; // prevent compiler optimizing loop away @@ -170,13 +170,13 @@ void test_policy(int team_range, int thread_range, int vector_range, for (int tr = 0; tr < thread_repeat; ++tr) { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { + [&](const int, double& lval) { double vector_result = 0.0; for (int vr = 0; vr < inner_repeat; ++vr) { vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); lval += vector_result; } diff --git a/packages/kokkos/benchmarks/stream/CMakeLists.txt b/packages/kokkos/benchmarks/stream/CMakeLists.txt new file mode 100644 index 000000000000..0dded6e3a541 --- /dev/null +++ b/packages/kokkos/benchmarks/stream/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + stream + SOURCES stream-kokkos.cpp +) diff --git a/packages/kokkos/bin/nvcc_wrapper b/packages/kokkos/bin/nvcc_wrapper index c1400872402b..9b935835d5ff 100755 --- a/packages/kokkos/bin/nvcc_wrapper +++ b/packages/kokkos/bin/nvcc_wrapper @@ -229,7 +229,7 @@ do fi ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-G|-lineinfo|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + 
--dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args diff --git a/packages/kokkos/cmake/KokkosConfig.cmake.in b/packages/kokkos/cmake/KokkosConfig.cmake.in index e26c75b31224..1b6d1b66ff5d 100644 --- a/packages/kokkos/cmake/KokkosConfig.cmake.in +++ b/packages/kokkos/cmake/KokkosConfig.cmake.in @@ -39,10 +39,12 @@ IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) GLOBAL CHECK_CUDA_COMPILES) -ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) +ELSEIF(@Kokkos_ENABLE_CUDA@ + AND NOT @KOKKOS_COMPILE_LANGUAGE@ STREQUAL CUDA + AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) # - # if CUDA was enabled, separable compilation was not specified, and current compiler - # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # if CUDA was enabled, the compilation language was not set to CUDA, and separable compilation was not + # specified, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. 
# kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, # otherwise, the original command will be executed diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in index 9930d2abf0f8..2df0f6c52054 100644 --- a/packages/kokkos/cmake/KokkosCore_config.h.in +++ b/packages/kokkos/cmake/KokkosCore_config.h.in @@ -23,8 +23,6 @@ #cmakedefine KOKKOS_ENABLE_CUDA #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX -#cmakedefine KOKKOS_ENABLE_MEMKIND -#cmakedefine KOKKOS_ENABLE_LIBRT #cmakedefine KOKKOS_ENABLE_SYCL #cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED @@ -32,6 +30,7 @@ #cmakedefine KOKKOS_ENABLE_CXX17 #cmakedefine KOKKOS_ENABLE_CXX20 #cmakedefine KOKKOS_ENABLE_CXX23 +#cmakedefine KOKKOS_ENABLE_CXX26 #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_CUDA_UVM @@ -45,7 +44,6 @@ #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK #cmakedefine KOKKOS_ENABLE_TUNING -#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_4 #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS @@ -53,17 +51,15 @@ #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN +#cmakedefine KOKKOS_ENABLE_ATOMICS_BYPASS /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC -#cmakedefine KOKKOS_USE_LIBRT -#cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH -#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND #cmakedefine KOKKOS_ENABLE_ONEDPL +#cmakedefine KOKKOS_ENABLE_ROCTHRUST -#cmakedefine KOKKOS_ARCH_SSE42 #cmakedefine KOKKOS_ARCH_ARMV80 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV81 @@ -78,6 +74,7 @@ #cmakedefine KOKKOS_ARCH_POWER7 #cmakedefine KOKKOS_ARCH_POWER8 
#cmakedefine KOKKOS_ARCH_POWER9 +#cmakedefine KOKKOS_ARCH_RISCV_SG2042 #cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_INTEL_DG1 #cmakedefine KOKKOS_ARCH_INTEL_GEN9 diff --git a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake index 792c92c07e9d..5a62c530fce6 100644 --- a/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -7,7 +7,8 @@ IF (NOT CUDAToolkit_ROOT) ENDIF() ENDIF() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") +# FIXME CMake 3.28.4 creates more targets than we export +IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0" AND CMAKE_VERSION VERSION_LESS "3.28.4") find_package(CUDAToolkit) ELSE() include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) diff --git a/packages/kokkos/cmake/Modules/FindTPLLIBRT.cmake b/packages/kokkos/cmake/Modules/FindTPLLIBRT.cmake deleted file mode 100644 index e75da56b5b53..000000000000 --- a/packages/kokkos/cmake/Modules/FindTPLLIBRT.cmake +++ /dev/null @@ -1 +0,0 @@ -KOKKOS_FIND_IMPORTED(LIBRT HEADER time.h LIBRARY rt) diff --git a/packages/kokkos/cmake/Modules/FindTPLMEMKIND.cmake b/packages/kokkos/cmake/Modules/FindTPLMEMKIND.cmake deleted file mode 100644 index 20aaff22955c..000000000000 --- a/packages/kokkos/cmake/Modules/FindTPLMEMKIND.cmake +++ /dev/null @@ -1 +0,0 @@ -KOKKOS_FIND_IMPORTED(MEMKIND HEADER memkind.h LIBRARY memkind) diff --git a/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake index 01791cff443c..603510c315e4 100644 --- a/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake +++ b/packages/kokkos/cmake/Modules/FindTPLONEDPL.cmake @@ -43,4 +43,7 @@ ELSE() COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) ENDIF() + + # Export oneDPL as a Kokkos dependency + KOKKOS_EXPORT_CMAKE_TPL(oneDPL) ENDIF() diff --git a/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake b/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake 
new file mode 100644 index 000000000000..dae7dc3c9520 --- /dev/null +++ b/packages/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake @@ -0,0 +1,15 @@ +# ROCm 5.6 and earlier set AMDGPU_TARGETS and GPU_TARGETS to all the supported +# architectures. Therefore, we end up compiling Kokkos for all the supported +# architecture. Starting with ROCm 5.7 AMDGPU_TARGETS and GPU_TARGETS are empty. +# It is the user's job to set the variables. Since we are injecting the +# architecture flag ourselves, we can let the variables empty. To replicate the +# behavior of ROCm 5.7 and later for earlier version of ROCm we set +# AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If +# the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them. +SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +FIND_PACKAGE(rocthrust REQUIRED) +KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) + +# Export ROCTHRUST as a Kokkos dependency +KOKKOS_EXPORT_CMAKE_TPL(rocthrust) diff --git a/packages/kokkos/cmake/kokkos_arch.cmake b/packages/kokkos/cmake/kokkos_arch.cmake index 30764bde8605..34e9f05986fc 100644 --- a/packages/kokkos/cmake/kokkos_arch.cmake +++ b/packages/kokkos/cmake/kokkos_arch.cmake @@ -49,7 +49,6 @@ DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(WSM "Intel Westmere CPU") DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") @@ -60,13 +59,12 @@ DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (A DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") 
DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(BGQ "IBM Blue Gene Q") -DECLARE_AND_CHECK_HOST_ARCH(POWER7 "IBM POWER7 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") +DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) SET(KOKKOS_SHOW_CUDA_ARCHS ON) @@ -191,9 +189,6 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ELSEIF(CUDAToolkit_BIN_DIR) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) ENDIF() - IF (KOKKOS_ENABLE_CUDA) - SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE) - ENDIF() ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) SET(CUDA_ARCH_FLAG "-gpu") GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) @@ -342,18 +337,6 @@ IF (KOKKOS_ARCH_ZEN3) SET(KOKKOS_ARCH_AVX2 ON) ENDIF() -IF (KOKKOS_ARCH_WSM) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSSE4.2 - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=px - DEFAULT -msse4.2 - ) - SET(KOKKOS_ARCH_SSE42 ON) -ENDIF() - IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) SET(KOKKOS_ARCH_AVX ON) COMPILER_SPECIFIC_FLAGS( @@ -378,6 +361,23 @@ IF (KOKKOS_ARCH_HSW) ) ENDIF() +IF (KOKKOS_ARCH_RISCV_SG2042) + IF(NOT + ((KOKKOS_CXX_COMPILER_ID STREQUAL GNU + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR + (KOKKOS_CXX_COMPILER_ID STREQUAL Clang + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14)) + ) + MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + ENDIF() + COMPILER_SPECIFIC_FLAGS( + COMPILER_ID 
KOKKOS_CXX_HOST_COMPILER_ID + DEFAULT -march=rv64imafdcv + ) +ENDIF() + + IF (KOKKOS_ARCH_BDW) SET(KOKKOS_ARCH_AVX2 ON) COMPILER_SPECIFIC_FLAGS( @@ -571,6 +571,11 @@ IF (KOKKOS_ENABLE_HIP) COMPILER_SPECIFIC_FLAGS( DEFAULT -fgpu-rdc ) + IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT --hip-link + ) + ENDIF() ELSE() COMPILER_SPECIFIC_FLAGS( DEFAULT -fno-gpu-rdc @@ -588,32 +593,44 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() # Check support for device_global variables -# FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is -# available, use that instead. -IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) +# FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device +# global variables with shared libraries using the "non-separable compilation" +# implementation. Otherwise, the feature is not supported when building shared +# libraries. Thus, we don't even check for support if shared libraries are +# requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. +IF(KOKKOS_ENABLE_SYCL) STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - CHECK_CXX_SOURCE_COMPILES(" - #include - using namespace sycl::ext::oneapi::experimental; - using namespace sycl; - - SYCL_EXTERNAL device_global Foo; - - void bar(queue q) { - q.single_task([=] { - Foo = 42; - }); - } - - int main(){ return 0; } - " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED - ) + INCLUDE(CheckCXXSymbolExists) + CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + # Use the non-separable compilation implementation to support shared libraries as well. 
+ COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + ELSEIF(NOT BUILD_SHARED_LIBS) + INCLUDE(CheckCXXSourceCompiles) + CHECK_CXX_SOURCE_COMPILES(" + #include + using namespace sycl::ext::oneapi::experimental; + using namespace sycl; + + SYCL_EXTERNAL device_global Foo; + + void bar(queue q) { + q.single_task([=] { + Foo = 42; + }); + } + + int main(){ return 0; } + " + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + # Only the separable compilation implementation is supported. + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + ) + ENDIF() ENDIF() ENDIF() @@ -767,30 +784,35 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) COMPILER_SPECIFIC_FLAGS( IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" -D__STRICT_ANSI__ - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" -D__STRICT_ANSI__ + ELSE() + COMPILER_SPECIFIC_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__ ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" -D__STRICT_ANSI__ + IF(KOKKOS_ARCH_INTEL_GEN9) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen 
-Xopenmp-target-backend "-device gen9" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN11) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" + ) + ELSEIF(KOKKOS_ARCH_INTEL_DG1) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" + ) + ELSEIF(KOKKOS_ARCH_INTEL_XEHP) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" + ) + ELSEIF(KOKKOS_ARCH_INTEL_PVC) + COMPILER_SPECIFIC_LINK_OPTIONS( + IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" + ) + ENDIF() ENDIF() ENDIF() @@ -1130,3 +1152,14 @@ MESSAGE(STATUS "Architectures:") FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) MESSAGE(STATUS " ${Arch}") ENDFOREACH() + + +IF(KOKKOS_ENABLE_ATOMICS_BYPASS) + IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") + MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_ATOMICS_BYPASS=ON) if a host parallel or a device backend is enabled!") + ENDIF() + IF(NOT KOKKOS_ENABLE_SERIAL) + MESSAGE(FATAL_ERROR "Implementation bug") # safeguard + ENDIF() + MESSAGE(STATUS "Atomics: **DISABLED**") +ENDIF() diff --git a/packages/kokkos/cmake/kokkos_compiler_id.cmake b/packages/kokkos/cmake/kokkos_compiler_id.cmake index 04589befc3ad..9135ca2b41c0 100644 --- a/packages/kokkos/cmake/kokkos_compiler_id.cmake +++ b/packages/kokkos/cmake/kokkos_compiler_id.cmake @@ -152,6 +152,7 @@ ENDIF() SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. 
Required compiler versions:") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) 8.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) 10.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) 15.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 8.2.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 19.0.5 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) 2021.1.1 or higher") @@ -210,6 +211,10 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() ENDIF() IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake index 89e23b019bdc..a437f6132aa9 100644 --- a/packages/kokkos/cmake/kokkos_enable_options.cmake +++ b/packages/kokkos/cmake/kokkos_enable_options.cmake @@ -48,7 +48,6 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda # resolved but we keep the option around a bit longer to be safe. 
KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 OFF "Whether code deprecated in major release 3 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") @@ -74,6 +73,7 @@ KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple ke # This option will go away eventually, but allows fallback to old implementation when needed. KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") KOKKOS_ENABLE_OPTION(IMPL_MDSPAN OFF "Whether to enable experimental mdspan support") KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") diff --git a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake index d4eca651d423..ae14a10d531f 100644 --- a/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/packages/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -7,6 +7,7 @@ KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INST SET(KOKKOS_ENABLE_CXX17 OFF) SET(KOKKOS_ENABLE_CXX20 OFF) SET(KOKKOS_ENABLE_CXX23 OFF) +SET(KOKKOS_ENABLE_CXX26 OFF) IF (KOKKOS_CXX_STANDARD) MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") ENDIF() diff --git a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake index 
7ad49fdd2d9d..b075a3e36b56 100644 --- a/packages/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/packages/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -74,6 +74,10 @@ ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") SET(KOKKOS_ENABLE_CXX23 ON) +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + kokkos_set_cxx_standard_feature(26) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + SET(KOKKOS_ENABLE_CXX26 ON) ELSE() MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") ENDIF() diff --git a/packages/kokkos/cmake/kokkos_tpls.cmake b/packages/kokkos/cmake/kokkos_tpls.cmake index f124596a84e0..6ef3b79bde25 100644 --- a/packages/kokkos/cmake/kokkos_tpls.cmake +++ b/packages/kokkos/cmake/kokkos_tpls.cmake @@ -32,19 +32,21 @@ FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) ENDFUNCTION() KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(MEMKIND Off) -IF(KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE ON) -ENDIF() KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -KOKKOS_TPL_OPTION(LIBRT Off) IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_HAS_TRILINOS) SET(ROCM_DEFAULT ON) ELSE() SET(ROCM_DEFAULT OFF) ENDIF() +IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) + SET(ROCTHRUST_DEFAULT ON) +ELSE() + SET(ROCTHRUST_DEFAULT OFF) +ENDIF() KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) +KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) + IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) SET(ONEDPL_DEFAULT ON) ELSE() @@ -77,21 +79,18 @@ KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake KOKKOS_IMPORT_TPL(HPX INTERFACE) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(CUDA INTERFACE) -ENDIF() +KOKKOS_IMPORT_TPL(CUDA INTERFACE) KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) -KOKKOS_IMPORT_TPL(MEMKIND) IF (NOT WIN32) 
KOKKOS_IMPORT_TPL(THREADS INTERFACE) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_IMPORT_TPL(ROCM INTERFACE) - KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) ENDIF() +KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) KOKKOS_IMPORT_TPL(LIBQUADMATH) +KOKKOS_IMPORT_TPL(ROCTHRUST) IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) @@ -119,7 +118,3 @@ STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable UNSET(KOKKOS_TPL_EXPORTS CACHE) SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) -IF (KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE) - LIST(APPEND KOKKOS_MEMSPACE_LIST HBWSpace) -ENDIF() diff --git a/packages/kokkos/cmake/kokkos_tribits.cmake b/packages/kokkos/cmake/kokkos_tribits.cmake index b30ca70ab954..060a7a8472c7 100644 --- a/packages/kokkos/cmake/kokkos_tribits.cmake +++ b/packages/kokkos/cmake/kokkos_tribits.cmake @@ -237,18 +237,10 @@ ENDMACRO() ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp MACRO(KOKKOS_CONFIGURE_CORE) - SET(FWD_BACKEND_LIST) - FOREACH(MEMSPACE ${KOKKOS_MEMSPACE_LIST}) - LIST(APPEND FWD_BACKEND_LIST ${MEMSPACE}) - ENDFOREACH() - FOREACH(BACKEND_ ${KOKKOS_ENABLED_DEVICES}) - LIST(APPEND FWD_BACKEND_LIST ${BACKEND_}) - ENDFOREACH() - MESSAGE(STATUS "Kokkos Devices: ${KOKKOS_ENABLED_DEVICES}, Kokkos Backends: ${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${FWD_BACKEND_LIST}") + MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in 
KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_PostInclude.hpp "KOKKOS_POST_INCLUDE" "Kokkos_Post_Include" "${KOKKOS_BACKEND_POST_INCLUDE_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") SET(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace") KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space") KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space") @@ -309,7 +301,6 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" DESTINATION ${KOKKOS_HEADER_DIR}) ENDMACRO() diff --git a/packages/kokkos/config/test_all_sandia b/packages/kokkos/config/test_all_sandia deleted file mode 100755 index 193a162a4e6e..000000000000 --- a/packages/kokkos/config/test_all_sandia +++ /dev/null @@ -1,773 +0,0 @@ -#!/bin/bash -e - -# -# Global config -# - -set -o pipefail - -# Determine current machine. 
- -MACHINE="" -HOSTNAME=$(hostname) -PROCESSOR=`uname -p` - -if [[ "$HOSTNAME" =~ (white|ride).* ]]; then - MACHINE=white - module load git -fi - -if [[ "$HOSTNAME" =~ .*bowman.* ]]; then - MACHINE=bowman - module load git -fi - -if [[ "$HOSTNAME" == n* ]]; then # Warning: very generic name - if [[ "$PROCESSOR" = "aarch64" ]]; then - MACHINE=sullivan - module load git - fi -fi - -if [[ "$HOSTNAME" == node* ]]; then # Warning: very generic name - if [[ "$MACHINE" = "" ]]; then - MACHINE=shepard - module load git - fi -fi - -if [[ "$HOSTNAME" == apollo\.* ]]; then - MACHINE=apollo - module load git -fi - -if [[ "$HOSTNAME" == sullivan ]]; then - MACHINE=sullivan - module load git -fi - -if [[ "$HOSTNAME" == mayer\.* ]]; then - MACHINE=mayer -# module load git -fi -if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name - MACHINE=mayer -fi - -if [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then - if [[ "$MACHINE" = "" ]]; then - MACHINE=sems - module load sems-git - fi -fi - -if [[ "$MACHINE" = "" ]]; then - echo "Unrecognized machine" >&2 - exit 1 -fi - -echo "Running on machine: $MACHINE" - -GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" -CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" -CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" - -GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" -IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" 
-INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -#CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" -PGI_WARNING_FLAGS="" - -# Default. Machine specific can override. -DEBUG=False -ARGS="" -CUSTOM_BUILD_LIST="" -DRYRUN=False -BUILD_ONLY=False -declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1 -TEST_SCRIPT=False -SKIP_HWLOC=False -SPOT_CHECK=False - -PRINT_HELP=False -OPT_FLAG="" -CXX_FLAGS_EXTRA="" -LD_FLAGS_EXTRA="" -KOKKOS_OPTIONS="" - -# -# Handle arguments. -# - -while [[ $# > 0 ]] -do - key="$1" - - case $key in - --kokkos-path*) - KOKKOS_PATH="${key#*=}" - ;; - --build-list*) - CUSTOM_BUILD_LIST="${key#*=}" - ;; - --debug*) - DEBUG=True - ;; - --build-only*) - BUILD_ONLY=True - ;; - --test-script*) - TEST_SCRIPT=True - ;; - --skip-hwloc*) - SKIP_HWLOC=True - ;; - --num*) - NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" - ;; - --dry-run*) - DRYRUN=True - ;; - --spot-check*) - SPOT_CHECK=True - ;; - --arch*) - ARCH_FLAG="--arch=${key#*=}" - ;; - --opt-flag*) - OPT_FLAG="${key#*=}" - ;; - --with-cuda-options*) - KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" - ;; - --with-options*) - KOKKOS_OPTIONS="--with-options=enable_large_mem_tests,${key#*=}" - ;; - --cxxflags-extra*) - CXX_FLAGS_EXTRA="${key#*=}" - ;; - --ldflags-extra*) - LD_FLAGS_EXTRA="${key#*=}" - ;; - --help*) - PRINT_HELP=True - ;; - *) - # args, just append - ARGS="$ARGS $1" - ;; - esac - - shift -done - -SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd ) - -# Set kokkos path. -if [ -z "$KOKKOS_PATH" ]; then - KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT -else - # Ensure KOKKOS_PATH is abs path. - KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) -fi - -UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null` -if ! [ -z "$UNCOMMITTED" ]; then - echo "WARNING!! 
THE FOLLOWING CHANGES ARE UNCOMMITTED!! :" - echo "$UNCOMMITTED" - echo "" -fi - -GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline` -echo "Repository Status: " ${GITSTATUS} -echo "" -echo "" - -# -# Machine specific config. -# - -if [ "$MACHINE" = "sems" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - - BASE_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-/" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="" - fi - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - 
"clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - fi -elif [ "$MACHINE" = "white" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/" - IBM_MODULE_LIST="/xl/" - CUDA_MODULE_LIST="/,gcc/6.4.0,ibm/xl/16.1.0" - - # Don't do pthread on white. - GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" - "cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=Power8,Kepler37" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "bowman" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/compilers/" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=KNL" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "sullivan" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - 
BASE_MODULE_LIST="/" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.1.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-ThunderX" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "mayer" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="/" - ARM_MODULE_LIST="/" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/1.4.0 $ARM_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-TX2" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "shepard" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/" - BASE_MODULE_LIST_INTEL="/compilers/" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "pgi/17.10.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=HSW" - fi - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "apollo" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - module use /home/projects/modulefiles/local/x86-64 - module load kokkos-env - - module load sems-git - module load sems-tex - module load sems-cmake/3.5.2 - module load sems-gdb - - SKIP_HWLOC=True - - BASE_MODULE_LIST="sems-env,kokkos-env,sems-/,kokkos-hwloc/1.10.1/base" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - 
CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,cuda/9.0.69" - NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,/,sems-gcc/5.3.0" - - BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" - BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" - BUILD_LIST_CLANG="Serial,Pthread,OpenMP" - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" - "cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" - "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - ) - fi - - if [ -z "$ARCH_FLAG" ]; then - 
ARCH_FLAG="--arch=SNB,Volta70" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -else - echo "Unhandled machine $MACHINE" >&2 - exit 1 -fi - -export OMP_NUM_THREADS=4 - -declare -i NUM_RESULTS_TO_KEEP=7 - -RESULT_ROOT_PREFIX=TestAll - -if [ "$PRINT_HELP" = "True" ]; then - echo "test_all_sandia :" - echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" - echo " Defaults to root repo containing this script" - echo "--debug: Run tests in debug. Defaults to False" - echo "--test-script: Test this script, not Kokkos" - echo "--skip-hwloc: Do not do hwloc tests" - echo "--num=N: Number of jobs to run in parallel" - echo "--spot-check: Minimal test set to issue pull request" - echo "--dry-run: Just print what would be executed" - echo "--build-only: Just do builds, don't run anything" - echo "--opt-flag=FLAG: Optimization flag (default: -O3)" - echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS" - echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS" - echo "--arch=ARCHITECTURE: overwrite architecture flags" - echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" - echo "--build-list=BUILD,BUILD,BUILD..." 
- echo " Provide a comma-separated list of builds instead of running all builds" - echo " Valid items:" - echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" - echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" - echo "" - - echo "ARGS: list of expressions matching compilers to test" - echo " supported compilers sems" - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - echo " $COMPILER" - done - echo "" - - echo "Examples:" - echo " Run all tests" - echo " % test_all_sandia" - echo "" - echo " Run all gcc tests" - echo " % test_all_sandia gcc" - echo "" - echo " Run all gcc/4.8.4 and all intel tests" - echo " % test_all_sandia gcc/4.8.4 intel" - echo "" - echo " Run all tests in debug" - echo " % test_all_sandia --debug" - echo "" - echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" - echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" - echo "" - echo "If you want to kill the tests, do:" - echo " hit ctrl-z" - echo " % kill -9 %1" - echo - exit 0 -fi - -# Set build type. -if [ "$DEBUG" = "True" ]; then - BUILD_TYPE=debug -else - BUILD_TYPE=release -fi - -# If no args provided, do all compilers. -if [ -z "$ARGS" ]; then - ARGS='?' -fi - -# Process args to figure out which compilers to test. -COMPILERS_TO_TEST="" - -for ARG in $ARGS; do - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - - if [[ "$COMPILER" = $ARG* ]]; then - if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then - COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER" - else - echo "Tried to add $COMPILER twice" - fi - fi - done -done - -# -# Functions. -# - -# get_compiler_name -get_compiler_name() { - echo $1 | cut -d/ -f1 -} - -# get_compiler_version -get_compiler_version() { - echo $1 | cut -d/ -f2 -} - -# Do not call directly. 
-get_compiler_data() { - local compiler=$1 - local item=$2 - local compiler_name=$(get_compiler_name $compiler) - local compiler_vers=$(get_compiler_version $compiler) - - local compiler_data - for compiler_data in "${COMPILERS[@]}" ; do - local arr=($compiler_data) - - if [ "$compiler" = "${arr[0]}" ]; then - echo "${arr[$item]}" | tr , ' ' | sed -e "s//$compiler_name/g" -e "s//$compiler_vers/g" - return 0 - fi - done - - # Not found. - echo "Unreconized compiler $compiler" >&2 - exit 1 -} - -# -# For all getters, usage: -# - -get_compiler_modules() { - get_compiler_data $1 1 -} - -get_compiler_build_list() { - get_compiler_data $1 2 -} - -get_compiler_exe_name() { - get_compiler_data $1 3 -} - -get_compiler_warning_flags() { - get_compiler_data $1 4 -} - -run_cmd() { - echo "RUNNING: $*" - if [ "$DRYRUN" != "True" ]; then - eval "$* 2>&1" - fi -} - -# report_and_log_test_results -report_and_log_test_result() { - # Use sane var names. - local success=$1; local desc=$2; local comment=$3; - - if [ "$success" = "0" ]; then - echo " PASSED $desc" - echo $comment > $PASSED_DIR/$desc - else - # For failures, comment should be the name of the phase that failed. - echo " FAILED $desc" >&2 - echo $comment > $FAILED_DIR/$desc - cat ${desc}.${comment}.log - fi -} - -setup_env() { - local compiler=$1 - local compiler_modules=$(get_compiler_modules $compiler) - - module purge - - local mod - for mod in $compiler_modules; do - echo "Loading module $mod" - module load $mod 2>&1 - # It is ridiculously hard to check for the success of a loaded - # module. Module does not return error codes and piping to grep - # causes module to run in a subshell. - module list 2>&1 | grep "$mod" >& /dev/null || return 1 - done - - return 0 -} - -# single_build_and_test -single_build_and_test() { - # Use sane var names. - local compiler=$1; local build=$2; local build_type=$3; - - # Set up env. 
- mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" - cd $ROOT_DIR/$compiler/"${build}-$build_type" - local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g') - setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - - # Set up flags. - local compiler_warning_flags=$(get_compiler_warning_flags $compiler) - local compiler_exe=$(get_compiler_exe_name $compiler) - - if [[ "$build_type" = hwloc* ]]; then - local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info))) - fi - - if [[ "$OPT_FLAG" = "" ]]; then - OPT_FLAG="-O3" - fi - - if [[ "$build_type" = *debug* ]]; then - local extra_args="$extra_args --debug" - local cxxflags="-g $compiler_warning_flags" - local ldflags="-g" - else - local cxxflags="$OPT_FLAG $compiler_warning_flags" - local ldflags="${OPT_FLAG}" - fi - - local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" - local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" - - if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" - fi - if [[ "$KOKKOS_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_OPTIONS" - else - local extra_args="$extra_args --with-options=enable_large_mem_tests" - fi - - echo " Starting job $desc" - - local comment="no_comment" - - if [ "$TEST_SCRIPT" = "True" ]; then - local rand=$[ 1 + $[ RANDOM % 10 ]] - sleep $rand - - if [ $rand -gt 5 ]; then - run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } - fi - else - run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - local -i build_start_time=$(date +%s) - run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } - local -i 
build_end_time=$(date +%s) - comment="build_time=$(($build_end_time-$build_start_time))" - - if [[ "$BUILD_ONLY" == False ]]; then - run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } - local -i run_end_time=$(date +%s) - comment="$comment run_time=$(($run_end_time-$build_end_time))" - fi - fi - - report_and_log_test_result 0 $desc "$comment" - - return 0 -} - -# wait_for_jobs -wait_for_jobs() { - local -i max_jobs=$1 - local -i num_active_jobs=$(jobs | wc -l) - while [ $num_active_jobs -ge $max_jobs ] - do - sleep 1 - num_active_jobs=$(jobs | wc -l) - jobs >& /dev/null - done -} - -# run_in_background -run_in_background() { - local compiler=$1 - - local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL - # Don't override command line input. - # if [[ "$BUILD_ONLY" == True ]]; then - # num_jobs=8 - # else - if [[ "$compiler" == cuda* ]]; then - num_jobs=1 - fi - if [[ "$compiler" == clang ]]; then - num_jobs=1 - fi - # fi - wait_for_jobs $num_jobs - - single_build_and_test $* & -} - -# build_and_test_all -build_and_test_all() { - # Get compiler data. - local compiler=$1 - if [ -z "$CUSTOM_BUILD_LIST" ]; then - local compiler_build_list=$(get_compiler_build_list $compiler) - else - local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ') - fi - - # Do builds. - local build - for build in $compiler_build_list - do - run_in_background $compiler $build $BUILD_TYPE - - # If not cuda, do a hwloc test too. - if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then - run_in_background $compiler $build "hwloc-$BUILD_TYPE" - fi - done - - return 0 -} - -get_test_root_dir() { - local existing_results=$(find . 
-maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort) - local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l) - local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP} - - if [ $num_to_delete -gt 0 ]; then - /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete) - fi - - echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S") -} - -wait_summarize_and_exit() { - wait_for_jobs 1 - - echo "#######################################################" - echo "PASSED TESTS" - echo "#######################################################" - - local passed_test - for passed_test in $(\ls -1 $PASSED_DIR | sort) - do - echo $passed_test $(cat $PASSED_DIR/$passed_test) - done - - local -i rv=0 - if [ "$(ls -A $FAILED_DIR)" ]; then - echo "#######################################################" - echo "FAILED TESTS" - echo "#######################################################" - - local failed_test - for failed_test in $(\ls -1 $FAILED_DIR | sort) - do - echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" - rv=$rv+1 - done - fi - - exit $rv -} - -# -# Main. 
-# - -ROOT_DIR=$(get_test_root_dir) -mkdir -p $ROOT_DIR -cd $ROOT_DIR - -PASSED_DIR=$ROOT_DIR/results/passed -FAILED_DIR=$ROOT_DIR/results/failed -mkdir -p $PASSED_DIR -mkdir -p $FAILED_DIR - -echo "Going to test compilers: " $COMPILERS_TO_TEST -for COMPILER in $COMPILERS_TO_TEST; do - echo "Testing compiler $COMPILER" - build_and_test_all $COMPILER -done - -wait_summarize_and_exit diff --git a/packages/kokkos/config/yaml/volta.yaml b/packages/kokkos/config/yaml/volta.yaml deleted file mode 100644 index f67af9c2a44a..000000000000 --- a/packages/kokkos/config/yaml/volta.yaml +++ /dev/null @@ -1,4 +0,0 @@ -packages: - kokkos: - variants: +cuda +openmp +volta70 +cuda_lambda +wrapper ^cuda@10.1 - compiler: [gcc@7.2.0] diff --git a/packages/kokkos/containers/src/Kokkos_Bitset.hpp b/packages/kokkos/containers/src/Kokkos_Bitset.hpp index cd5ca4ea5123..f50ab0a0f7e9 100644 --- a/packages/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/packages/kokkos/containers/src/Kokkos_Bitset.hpp @@ -28,24 +28,6 @@ namespace Kokkos { -namespace Impl { -//! Either append to the label if the property already exists, or set it. -template -auto with_updated_label(const ViewCtorProp& view_ctor_prop, - const std::string& label) { - using vcp_t = ViewCtorProp; - //! If the label property is already set, append. Otherwise, set label. 
- if constexpr (vcp_t::has_label) { - vcp_t new_ctor_props(view_ctor_prop); - static_cast&>(new_ctor_props) - .value.append(label); - return new_ctor_props; - } else { - return Impl::with_properties_if_unset(view_ctor_prop, label); - } -} -} // namespace Impl - template class Bitset; @@ -92,9 +74,10 @@ class Bitset { using block_view_type = View>; public: - /// constructor + Bitset() = default; + /// arg_size := number of bit in set - Bitset(unsigned arg_size = 0u) : Bitset(Kokkos::view_alloc(), arg_size) {} + Bitset(unsigned arg_size) : Bitset(Kokkos::view_alloc(), arg_size) {} template Bitset(const Impl::ViewCtorProp& arg_prop, unsigned arg_size) @@ -108,9 +91,8 @@ class Bitset { "Allocation properties should not contain the 'pointer' property."); //! Update 'label' property and allocate. - const auto prop_copy = Kokkos::Impl::with_updated_label( - Impl::with_properties_if_unset(arg_prop, std::string("Bitset")), - " - blocks"); + const auto prop_copy = + Impl::with_properties_if_unset(arg_prop, std::string("Bitset")); m_blocks = block_view_type(prop_copy, ((m_size + block_mask) >> block_shift)); @@ -310,8 +292,8 @@ class Bitset { } private: - unsigned m_size; - unsigned m_last_block_mask; + unsigned m_size = 0; + unsigned m_last_block_mask = 0; block_view_type m_blocks; private: diff --git a/packages/kokkos/containers/src/Kokkos_DualView.hpp b/packages/kokkos/containers/src/Kokkos_DualView.hpp index 84bced2cc447..e821570a8d5f 100644 --- a/packages/kokkos/containers/src/Kokkos_DualView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DualView.hpp @@ -292,15 +292,6 @@ class DualView : public ViewTraits { d_view(src.d_view), h_view(src.h_view) {} - //! Copy assignment operator (shallow copy assignment) - template - DualView& operator=(const DualView& src) { - modified_flags = src.modified_flags; - d_view = src.d_view; - h_view = src.h_view; - return *this; - } - //! Subview constructor template DualView(const DualView& src, const Arg0& arg0, Args... 
args) diff --git a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp index 52aa86d8ee43..5fa59f1b7cdf 100644 --- a/packages/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/packages/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -1340,7 +1340,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits::type, Args...>( - v.data(), v.impl_map().layout()); + auto layout = v.impl_map().layout(); + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + for (int i = N; i < 7; ++i) + layout.dimension[i] = KOKKOS_IMPL_CTOR_DEFAULT_ARG; + } + + return View::type, Args...>(v.data(), layout); } template diff --git a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp index 92aead28784f..91a7e4a92732 100644 --- a/packages/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/packages/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -124,15 +124,8 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( args...); Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if it is not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. 
*/ - if (tracker.has_record()) { - Kokkos::Impl::operator_bounds_error_on_device(map); - } else { Kokkos::abort("OffsetView bounds error"); })) + KOKKOS_IF_ON_DEVICE( + (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) } } diff --git a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp index e001c062de3a..78a6a238ece1 100644 --- a/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/packages/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -243,16 +243,16 @@ class UnorderedMap { using const_map_type = UnorderedMap; - static const bool is_set = std::is_void::value; - static const bool has_const_key = - std::is_same::value; - static const bool has_const_value = - is_set || std::is_same::value; + static constexpr bool is_set = std::is_void_v; + static constexpr bool has_const_key = + std::is_same_v; + static constexpr bool has_const_value = + is_set || std::is_same_v; - static const bool is_insertable_map = + static constexpr bool is_insertable_map = !has_const_key && (is_set || !has_const_value); - static const bool is_modifiable_map = has_const_key && !has_const_value; - static const bool is_const_map = has_const_key && has_const_value; + static constexpr bool is_modifiable_map = has_const_key && !has_const_value; + static constexpr bool is_const_map = has_const_key && has_const_value; using insert_result = UnorderedMapInsertResult; @@ -337,27 +337,27 @@ class UnorderedMap { Impl::get_property(prop_copy) + " - size")); m_available_indexes = - bitset_type(Kokkos::Impl::with_updated_label(prop_copy, " - bitset"), + bitset_type(Kokkos::Impl::append_to_label(prop_copy, " - bitset"), calculate_capacity(capacity_hint)); m_hash_lists = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - hash list"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - hash list"), Impl::find_hash_size(capacity())); m_next_index = size_type_view( - 
Kokkos::Impl::with_updated_label(prop_copy_noinit, " - next index"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - next index"), capacity() + 1); // +1 so that the *_at functions can always return a // valid reference - m_keys = key_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - keys"), capacity()); + m_keys = key_type_view(Kokkos::Impl::append_to_label(prop_copy, " - keys"), + capacity()); - m_values = value_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - values"), - is_set ? 0 : capacity()); + m_values = + value_type_view(Kokkos::Impl::append_to_label(prop_copy, " - values"), + is_set ? 0 : capacity()); m_scalars = - scalars_view(Kokkos::Impl::with_updated_label(prop_copy, " - scalars")); + scalars_view(Kokkos::Impl::append_to_label(prop_copy, " - scalars")); /** * Deep copies should also be done using the space instance if given. diff --git a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index 8f8cd9523b72..a979ee40d8c8 100644 --- a/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/packages/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -27,6 +27,18 @@ namespace Kokkos { namespace Impl { +//! Append to the label contained in view_ctor_prop. 
+template +auto append_to_label(const ViewCtorProp& view_ctor_prop, + const std::string& label) { + using vcp_t = ViewCtorProp; + static_assert(vcp_t::has_label); + vcp_t new_ctor_props(view_ctor_prop); + static_cast&>(new_ctor_props) + .value.append(label); + return new_ctor_props; +} + uint32_t find_hash_size(uint32_t size); template diff --git a/packages/kokkos/containers/unit_tests/Makefile b/packages/kokkos/containers/unit_tests/Makefile index 2e35832cc891..18410882bca9 100644 --- a/packages/kokkos/containers/unit_tests/Makefile +++ b/packages/kokkos/containers/unit_tests/Makefile @@ -35,8 +35,8 @@ TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynV tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include" > Test$(device)_$(test).cpp); \ - $(shell echo "\#include" >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" >> Test$(device)_$(test).cpp); \ )\ ) \ ) diff --git a/packages/kokkos/containers/unit_tests/TestBitset.hpp b/packages/kokkos/containers/unit_tests/TestBitset.hpp index 3ad0d2bf5734..9923453f72ce 100644 --- a/packages/kokkos/containers/unit_tests/TestBitset.hpp +++ b/packages/kokkos/containers/unit_tests/TestBitset.hpp @@ -23,6 +23,8 @@ #include #include +#include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp> + namespace Test { namespace Impl { @@ -155,7 +157,7 @@ void test_bitset() { { unsigned ts = 100u; - bitset_type b1; + bitset_type b1(Kokkos::view_alloc("MyBitset"), 0); ASSERT_TRUE(b1.is_allocated()); b1 = bitset_type(ts); @@ -165,6 +167,9 @@ void test_bitset() { ASSERT_TRUE(b1.is_allocated()); ASSERT_TRUE(b2.is_allocated()); ASSERT_TRUE(b3.is_allocated()); + + bitset_type b4; + ASSERT_FALSE(b4.is_allocated()); } std::array test_sizes = { @@ -237,6 +242,24 @@ void test_bitset() { } 
TEST(TEST_CATEGORY, bitset) { test_bitset(); } + +TEST(TEST_CATEGORY, bitset_default_constructor_no_alloc) { + using namespace Kokkos::Test::Tools; + listen_tool_events(Config::DisableAll(), Config::EnableAllocs()); + + auto success = validate_absence( + [&]() { + Kokkos::Bitset bs; + EXPECT_FALSE(bs.is_allocated()); + }, + [&](AllocateDataEvent) { + return MatchDiagnostic{true, {"Found alloc event"}}; + }); + ASSERT_TRUE(success); + + listen_tool_events(Config::DisableAll()); +} + } // namespace Test #endif // KOKKOS_TEST_BITSET_HPP diff --git a/packages/kokkos/core/perf_test/CMakeLists.txt b/packages/kokkos/core/perf_test/CMakeLists.txt index 7f3916da3127..e0dba03e1ecb 100644 --- a/packages/kokkos/core/perf_test/CMakeLists.txt +++ b/packages/kokkos/core/perf_test/CMakeLists.txt @@ -50,8 +50,8 @@ ELSE() FetchContent_Declare( googlebenchmark DOWNLOAD_EXTRACT_TIMESTAMP FALSE - URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b + URL https://github.com/google/benchmark/archive/refs/tags/v1.7.1.tar.gz + URL_HASH MD5=0459a6c530df9851bee6504c3e37c2e7 ) FetchContent_MakeAvailable(googlebenchmark) list(POP_BACK CMAKE_MESSAGE_INDENT) diff --git a/packages/kokkos/core/src/CMakeLists.txt b/packages/kokkos/core/src/CMakeLists.txt index 012af0a7d06a..b84677e61b6f 100644 --- a/packages/kokkos/core/src/CMakeLists.txt +++ b/packages/kokkos/core/src/CMakeLists.txt @@ -18,10 +18,16 @@ IF (NOT desul_FOUND) ENDIF() IF(KOKKOS_ENABLE_SYCL) SET(DESUL_ATOMICS_ENABLE_SYCL ON) + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + ENDIF() ENDIF() IF(KOKKOS_ENABLE_OPENMPTARGET) SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP ENDIF() + IF(KOKKOS_ENABLE_OPENACC) + SET(DESUL_ATOMICS_ENABLE_OPENACC ON) + ENDIF() CONFIGURE_FILE( 
${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp @@ -80,10 +86,6 @@ IF (KOKKOS_ENABLE_HPX) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) ENDIF() -IF (NOT KOKKOS_ENABLE_MEMKIND) - LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_HBWSpace.cpp) -ENDIF() - IF (KOKKOS_ENABLE_SERIAL) APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) @@ -180,20 +182,15 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) ENDIF() KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC MEMKIND) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -ENDIF() +KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread IF (NOT WIN32) KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) ENDIF() # FIXME: We need a proper solution to figure out whether to enable diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index 8bfaf8317b65..276d03da2657 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -46,7 +46,6 @@ static_assert(false, namespace Kokkos { namespace Impl { -class CudaExec; class CudaInternal; } // namespace Impl } // namespace Kokkos @@ -129,33 +128,16 @@ class Cuda { /// \brief True if and only if this method is being called in a /// thread-parallel function. 
- KOKKOS_INLINE_FUNCTION static int in_parallel() { + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__CUDA_ARCH__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. - * - * This function sets the device in a "sleep" state in which it is - * not ready for work. This may consume less resources than if the - * device were in an "awake" state, but it may also take time to - * bring the device from a sleep state to be ready for work. - * - * \return True if the device is in the "sleep" state, else false if - * the device is actively working and could not enter the "sleep" - * state. - */ - static bool sleep(); - - /// \brief Wake the device from the 'sleep' state so it is ready for work. - /// - /// \return True if the device is in the "ready" state, else "false" - /// if the device is actively working (which also means that it's - /// awake). - static bool wake(); +#endif /// \brief Wait until all dispatched functors complete. /// @@ -199,18 +181,37 @@ class Cuda { //! Initialize, telling the CUDA run-time library which device to use. static void impl_initialize(InitializationSettings const&); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Cuda device architecture of the selected device. /// /// This matches the __CUDA_ARCH__ specification. - static size_type device_arch(); + KOKKOS_DEPRECATED static size_type device_arch() { + const cudaDeviceProp& cudaProp = Cuda().cuda_device_prop(); + return cudaProp.major * 100 + cudaProp.minor; + } //! Query device count. - static size_type detect_device_count(); + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; + } /** \brief Detect the available devices and their architecture * as defined by the __CUDA_ARCH__ specification. 
*/ - static std::vector detect_device_arch(); + KOKKOS_DEPRECATED static std::vector detect_device_arch() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + std::vector out; + for (int i = 0; i < count; ++i) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + out.push_back(prop.major * 100 + prop.minor); + } + return out; + } +#endif cudaStream_t cuda_stream() const; int cuda_device() const; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index c6512f44dadc..0944937e1bf6 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -33,7 +33,6 @@ //#include #include -#include #include @@ -83,11 +82,11 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { KOKKOS_IMPL_CUDA_SAFE_CALL( (CudaInternal::singleton().cuda_memcpy_async_wrapper( dst, src, n, cudaMemcpyDefault, s))); - Impl::cuda_stream_synchronize( - s, + Kokkos::Tools::Experimental::Impl::profile_fence_event( + "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync", Kokkos::Tools::Experimental::SpecialSynchronizationCases:: DeepCopyResourceSynchronization, - "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync"); + [&]() { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(s)); }); } } // namespace Impl @@ -135,11 +134,23 @@ void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) { namespace Kokkos { -CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaSpace::CudaSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaSpace::CudaSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaUVMSpace::CudaUVMSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaUVMSpace::CudaUVMSpace(int 
device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaHostPinnedSpace::CudaHostPinnedSpace() {} +CudaHostPinnedSpace::CudaHostPinnedSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaHostPinnedSpace::CudaHostPinnedSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} size_t memory_threshold_g = 40000; // 40 kB @@ -161,52 +172,38 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, } namespace { -void *impl_allocate_common(const Cuda &exec_space, const char *arg_label, - const size_t arg_alloc_size, +void *impl_allocate_common(const int device_id, + [[maybe_unused]] const cudaStream_t stream, + const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle, - bool exec_space_provided) { + [[maybe_unused]] bool stream_sync_only) { void *ptr = nullptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + cudaError_t error_code = cudaSuccess; #ifndef CUDART_VERSION #error CUDART_VERSION undefined! 
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - cudaError_t error_code; if (arg_alloc_size >= memory_threshold_g) { - if (exec_space_provided) { - error_code = - exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - exec_space.fence("Kokkos::Cuda: backend fence after async malloc"); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence after async malloc"); + error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); + + if (error_code == cudaSuccess) { + if (stream_sync_only) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + } else { + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async malloc"); + } } - } else { - error_code = - (exec_space_provided - ? exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size) - : Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size)); - } -#else - cudaError_t error_code; - if (exec_space_provided) { - error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } + } else #endif + { error_code = cudaMalloc(&ptr, arg_alloc_size); } if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: @@ -226,7 +223,7 @@ void *CudaSpace::impl_allocate( const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const 
Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(Kokkos::Cuda{}, arg_label, arg_alloc_size, + return impl_allocate_common(m_device, m_stream, arg_label, arg_alloc_size, arg_logical_size, arg_handle, false); } @@ -234,8 +231,9 @@ void *CudaSpace::impl_allocate( const Cuda &exec_space, const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(exec_space, arg_label, arg_alloc_size, - arg_logical_size, arg_handle, true); + return impl_allocate_common( + exec_space.cuda_device(), exec_space.cuda_stream(), arg_label, + arg_alloc_size, arg_logical_size, arg_handle, true); } void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const { @@ -256,28 +254,27 @@ void *CudaUVMSpace::impl_allocate( if (arg_alloc_size > 0) { Kokkos::Impl::num_uvm_allocations++; - auto error_code = - Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper( - &ptr, arg_alloc_size, cudaMemAttachGlobal); - -#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST - if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_mem_advise_wrapper( - ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, - cudaCpuDeviceId))); -#endif + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: CudaMallocManaged); } + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) + 
KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, + cudaCpuDeviceId)); +#endif } Cuda::impl_static_fence( "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation"); @@ -302,13 +299,14 @@ void *CudaHostPinnedSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; - auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper( - &ptr, arg_alloc_size, cudaHostAllocDefault); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: @@ -350,18 +348,17 @@ void CudaSpace::impl_deallocate( if (arg_alloc_size >= memory_threshold_g) { Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence before async free"); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_async_wrapper( - arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, m_stream)); Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence after async free"); } else { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } #else - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); #endif } catch (...) 
{ } @@ -393,8 +390,8 @@ void CudaUVMSpace::impl_deallocate( try { if (arg_alloc_ptr != nullptr) { Kokkos::Impl::num_uvm_allocations--; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } } catch (...) { } @@ -424,8 +421,8 @@ void CudaHostPinnedSpace::impl_deallocate( reported_size); } try { - KOKKOS_IMPL_CUDA_SAFE_CALL(( - Impl::CudaInternal::singleton().cuda_free_host_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); } catch (...) { } } @@ -438,160 +435,6 @@ void CudaHostPinnedSpace::impl_deallocate( namespace Kokkos { namespace Impl { -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -//============================================================================== -// {{{1 - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -void SharedAllocationRecord::deep_copy_header_no_exec( - void *ptr, const void *header) { - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy(exec, ptr, header, - sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - 
m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -// end SharedAllocationRecord destructors }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::Cuda &arg_exec_space, const Kokkos::CudaSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - 
arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -// end SharedAllocationRecord constructors }}}1 -//============================================================================== - void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; @@ -620,19 +463,12 @@ void cuda_prefetch_pointer(const Cuda 
&space, const void *ptr, size_t bytes, #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class SharedAllocationRecordCommon; -template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaHostPinnedSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index b8fa335cd3b2..0e20193e8b42 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -68,6 +68,11 @@ class CudaSpace { /*--------------------------------*/ CudaSpace(); + + private: + CudaSpace(int device_id, cudaStream_t stream); + + public: CudaSpace(CudaSpace&& rhs) = default; CudaSpace(const CudaSpace& rhs) = default; CudaSpace& operator=(CudaSpace&& rhs) = default; @@ -89,9 +94,11 @@ class CudaSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaSpace impl_create(int device_id, cudaStream_t stream) { + return CudaSpace(device_id, stream); + } + private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const Cuda& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -112,10 
+119,10 @@ class CudaSpace { static constexpr const char* name() { return m_name; } private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; static constexpr const char* m_name = "Cuda"; - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> @@ -149,6 +156,11 @@ class CudaUVMSpace { /*--------------------------------*/ CudaUVMSpace(); + + private: + CudaUVMSpace(int device_id, cudaStream_t stream); + + public: CudaUVMSpace(CudaUVMSpace&& rhs) = default; CudaUVMSpace(const CudaUVMSpace& rhs) = default; CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; @@ -156,6 +168,16 @@ class CudaUVMSpace { ~CudaUVMSpace() = default; /**\brief Allocate untracked memory in the cuda space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -167,8 +189,6 @@ class CudaUVMSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -189,8 +209,13 @@ class CudaUVMSpace { #endif /*--------------------------------*/ + static CudaUVMSpace impl_create(int device_id, cudaStream_t stream) { + return CudaUVMSpace(device_id, stream); + } + private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST static bool kokkos_impl_cuda_pin_uvm_to_host_v; @@ -223,6 +248,11 @@ class CudaHostPinnedSpace { 
/*--------------------------------*/ CudaHostPinnedSpace(); + + private: + CudaHostPinnedSpace(int device_id, cudaStream_t stream); + + public: CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; @@ -230,6 +260,16 @@ class CudaHostPinnedSpace { ~CudaHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -240,9 +280,11 @@ class CudaHostPinnedSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaHostPinnedSpace impl_create(int device_id, cudaStream_t stream) { + return CudaHostPinnedSpace(device_id, stream); + } + private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -258,6 +300,9 @@ class CudaHostPinnedSpace { static constexpr const char* name() { return m_name; } private: + int m_device; + cudaStream_t m_stream; + static constexpr const char* m_name = "CudaHostPinned"; /*--------------------------------*/ @@ -280,15 +325,12 @@ const std::unique_ptr& cuda_get_deep_copy_space( bool initialize = true); static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); 
+static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); //---------------------------------------- @@ -516,179 +558,10 @@ struct DeepCopy -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecord; - friend class SharedAllocationRecordCommon; - friend class HostInaccessibleSharedAllocationRecordCommon; - - using RecordBase = SharedAllocationRecord; - using base_t = - HostInaccessibleSharedAllocationRecordCommon; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::CudaSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
- template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - // workaround for issue with NVCC and MSVC - // https://github.com/kokkos/kokkos/issues/5258 - deep_copy_header_no_exec(RecordBase::m_alloc_ptr, &header); - } - - SharedAllocationRecord( - const Kokkos::Cuda& exec_space, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::CudaSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - // helper function to work around MSVC+NVCC issue - // https://github.com/kokkos/kokkos/issues/5258 - static void deep_copy_header_no_exec(void*, const void*); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaUVMSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one 
without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using RecordBase = SharedAllocationRecord; - using base_t = SharedAllocationRecordCommon; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
- template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp index f68e05f78040..c4458c910ca7 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -27,10 +27,6 @@ namespace Kokkos { namespace Impl { -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string& name); void cuda_device_synchronize(const std::string& name); void cuda_stream_synchronize(const cudaStream_t stream, const std::string& name); diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp 
b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index a4d064e544a7..5a821ab64a3c 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -23,8 +23,7 @@ #include -#include // GraphAccess needs to be complete -#include // SharedAllocationRecord +#include // GraphAccess needs to be complete #include #include @@ -50,10 +49,6 @@ class GraphNodeKernelImpl m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - // Note: owned pointer to CudaSpace memory (used for global memory launches), - // which we're responsible for deallocating, but not responsible for calling - // its destructor. - using Record = Kokkos::Impl::SharedAllocationRecord; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; @@ -82,9 +77,7 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - - auto* record = Record::allocate( - Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr) return m_driver_storage; } diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index d7f853d99102..849e8b3b30e8 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -#include -#include -#include -#include +//#include +//#include +//#include +//#include #include #include #include @@ -97,21 +97,21 @@ __global__ void query_cuda_kernel_arch(int 
*d_arch) { } /** Query what compute capability is actually launched to the device: */ -int cuda_kernel_arch() { +int cuda_kernel_arch(int device_id) { int arch = 0; int *d_arch = nullptr; - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_malloc_wrapper( - reinterpret_cast(&d_arch), sizeof(int)))); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - d_arch, &arch, sizeof(int), cudaMemcpyDefault))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&d_arch), sizeof(int))); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault)); query_cuda_kernel_arch<<<1, 1>>>(d_arch); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - &arch, d_arch, sizeof(int), cudaMemcpyDefault))); KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_free_wrapper(d_arch))); + cudaMemcpy(&arch, d_arch, sizeof(int), cudaMemcpyDefault)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(d_arch)); return arch; } @@ -135,7 +135,6 @@ Kokkos::View cuda_global_unique_token_locks( return locks; } -// FIXME_CUDA_MULTIPLE_DEVICES void cuda_device_synchronize(const std::string &name) { Kokkos::Tools::Experimental::Impl::profile_fence_event( name, @@ -144,16 +143,16 @@ void cuda_device_synchronize(const std::string &name) { #if defined(KOKKOS_COMPILER_CLANG) // annotate with __host__ silence a clang warning about using // cudaDeviceSynchronize in device code - [] __host__() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + [] __host__() #else - []() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + []() #endif + { + for (int cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } + }); } void cuda_stream_synchronize(const 
cudaStream_t stream, const CudaInternal *ptr, @@ -168,25 +167,11 @@ void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, }); } -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string &name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, reason, [&]() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_synchronize_wrapper( - stream))); - }); -} - void cuda_internal_error_throw(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -196,10 +181,8 @@ void cuda_internal_error_throw(cudaError e, const char *name, const char *file, void cuda_internal_error_abort(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -208,96 +191,6 @@ void cuda_internal_error_abort(cudaError e, const char *name, const char *file, host_abort(out.str().c_str()); } -//---------------------------------------------------------------------------- -// Some significant cuda device properties: -// -// cudaDeviceProp::name : Text label for device -// cudaDeviceProp::major : Device major number -// cudaDeviceProp::minor : Device minor number -// cudaDeviceProp::warpSize : number of threads per warp -// cudaDeviceProp::multiProcessorCount : number of multiprocessors 
-// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block -// cudaDeviceProp::totalConstMem : capacity of constant memory -// cudaDeviceProp::totalGlobalMem : capacity of global memory -// cudaDeviceProp::maxGridSize[3] : maximum grid size - -// -// Section 4.4.2.4 of the CUDA Toolkit Reference Manual -// -// struct cudaDeviceProp { -// char name[256]; -// size_t totalGlobalMem; -// size_t sharedMemPerBlock; -// int regsPerBlock; -// int warpSize; -// size_t memPitch; -// int maxThreadsPerBlock; -// int maxThreadsDim[3]; -// int maxGridSize[3]; -// size_t totalConstMem; -// int major; -// int minor; -// int clockRate; -// size_t textureAlignment; -// int deviceOverlap; -// int multiProcessorCount; -// int kernelExecTimeoutEnabled; -// int integrated; -// int canMapHostMemory; -// int computeMode; -// int concurrentKernels; -// int ECCEnabled; -// int pciBusID; -// int pciDeviceID; -// int tccDriver; -// int asyncEngineCount; -// int unifiedAddressing; -// int memoryClockRate; -// int memoryBusWidth; -// int l2CacheSize; -// int maxThreadsPerMultiProcessor; -// }; - -namespace { - -class CudaInternalDevices { - public: - enum { MAXIMUM_DEVICE_COUNT = 64 }; - struct cudaDeviceProp m_cudaProp[MAXIMUM_DEVICE_COUNT]; - int m_cudaDevCount; - - CudaInternalDevices(); - - static const CudaInternalDevices &singleton(); -}; - -CudaInternalDevices::CudaInternalDevices() { - // See 'cudaSetDeviceFlags' for host-device thread interaction - // Section 4.4.2.6 of the CUDA Toolkit Reference Manual - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_count_wrapper( - &m_cudaDevCount))); - - if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) { - Kokkos::abort( - "Sorry, you have more GPUs per node than we thought anybody would ever " - "have. 
Please report this to github.com/kokkos/kokkos."); - } - for (int i = 0; i < m_cudaDevCount; ++i) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_properties_wrapper( - m_cudaProp + i, i))); - } -} - -const CudaInternalDevices &CudaInternalDevices::singleton() { - static CudaInternalDevices self; - return self; -} - -} // namespace - //---------------------------------------------------------------------------- int Impl::CudaInternal::concurrency() { @@ -307,8 +200,6 @@ int Impl::CudaInternal::concurrency() { } void CudaInternal::print_configuration(std::ostream &s) const { - const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); - #if defined(KOKKOS_ENABLE_CUDA) s << "macro KOKKOS_ENABLE_CUDA : defined\n"; #endif @@ -317,22 +208,23 @@ void CudaInternal::print_configuration(std::ostream &s) const { << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n'; #endif - for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { - s << "Kokkos::Cuda[ " << i << " ] " << dev_info.m_cudaProp[i].name - << " capability " << dev_info.m_cudaProp[i].major << "." - << dev_info.m_cudaProp[i].minor << ", Total Global Memory: " - << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) + for (int i : get_visible_devices()) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + s << "Kokkos::Cuda[ " << i << " ] " << prop.name << " capability " + << prop.major << "." 
<< prop.minor + << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem) << ", Shared Memory per Block: " - << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); + << human_memory_size(prop.sharedMemPerBlock); if (m_cudaDev == i) s << " : Selected"; - s << std::endl; + s << '\n'; } } //---------------------------------------------------------------------------- CudaInternal::~CudaInternal() { - if (m_stream || m_scratchSpace || m_scratchFlags || m_scratchUnified) { + if (m_scratchSpace || m_scratchFlags || m_scratchUnified) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl; } @@ -370,45 +262,53 @@ void CudaInternal::fence() const { fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { +void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); was_initialized = true; + // Check that the device associated with the stream matches cuda_device + CUcontext context; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuStreamGetCtx(stream, &context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxPushCurrent(context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&m_cudaDev))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); + + m_stream = stream; + CudaInternal::cuda_devices.insert(m_cudaDev); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (!constantMemHostStagingPerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( + reinterpret_cast(&constantMemHostStagingPerDevice[m_cudaDev]), + CudaTraits::ConstantMemoryUsage))); + + if (!constantMemReusablePerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL( + 
(cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev]))); + //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { - const unsigned reduce_block_count = - m_maxWarpCount * Impl::CudaTraits::WarpSize; + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + auto const maxWarpCount = std::min( + m_deviceProp.maxThreadsPerBlock / CudaTraits::WarpSize, + CudaTraits::WarpSize); + unsigned const reduce_block_count = + maxWarpCount * Impl::CudaTraits::WarpSize; (void)scratch_unified(16 * sizeof(size_type)); (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); } - // Init the array for used for arbitrarily sized atomics - if (this == &singleton()) { - desul::Impl::init_lock_arrays(); // FIXME - } - - // Allocate a staging buffer for constant mem in pinned host memory - // and an event to avoid overwriting driver for previous kernel launches - if (this == &singleton()) { - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( - reinterpret_cast(&constantMemHostStaging), - CudaTraits::ConstantMemoryUsage))); - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_create_wrapper(&constantMemReusable))); - } - - m_stream = stream; - m_manage_stream = manage_stream; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -427,22 +327,23 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if 
(m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. KOKKOS_IMPL_CUDA_SAFE_CALL( (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size))); } @@ -453,21 +354,19 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -476,23 +375,20 @@ 
Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { if (verify_is_initialized("scratch_unified") && m_scratchUnifiedCount < scratch_count(size)) { - m_scratchUnifiedCount = scratch_count(size); + auto mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchUnified) { + mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + } - if (m_scratchUnified) - Record::decrement(Record::get_record(m_scratchUnified)); + m_scratchUnifiedCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain); - Record *const r = - Record::allocate(Kokkos::CudaHostPinnedSpace(), - "Kokkos::InternalScratchUnified", alloc_size); - - Record::increment(r); - - m_scratchUnified = reinterpret_cast(r->data()); + m_scratchUnified = static_cast( + mem_space.allocate("Kokkos::InternalScratchUnified", alloc_size)); } return m_scratchUnified; @@ -500,21 +396,16 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = - Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchFunctor) - Record::decrement(Record::get_record(m_scratchFunctor)); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - Record *const r = - Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); + if (m_scratchFunctor) { + mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } - Record::increment(r); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast(r->data()); + m_scratchFunctor = static_cast(mem_space.allocate( + 
"Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); } return m_scratchFunctor; @@ -537,21 +428,21 @@ void *CudaInternal::resize_team_scratch_space(int scratch_pool_id, // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (m_team_scratch_current_size[scratch_pool_id] == 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc( - "Kokkos::CudaSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", bytes); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -568,50 +459,33 @@ void CudaInternal::finalize() { was_finalized = true; - // Only finalize this if we're the singleton - if (this == &singleton()) { - (void)Impl::cuda_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_free_host_wrapper(constantMemHostStaging))); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_destroy_wrapper(constantMemReusable))); - auto &deep_copy_space = - Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); - if (deep_copy_space) - deep_copy_space->impl_internal_space_instance()->finalize(); - KOKKOS_IMPL_CUDA_SAFE_CALL( - 
(cuda_stream_destroy_wrapper(cuda_get_deep_copy_stream()))); - } - + auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordCuda = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; - - RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags)); - RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace)); - RecordHost::decrement(RecordHost::get_record(m_scratchUnified)); - if (m_scratchFunctorSize > 0) - RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor)); + auto host_mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); + cuda_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + cuda_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + host_mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + if (m_scratchFunctorSize > 0) { + cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } } for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) - Kokkos::kokkos_free(m_team_scratch_ptr[i]); + cuda_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); } - if (m_manage_stream && get_stream() != nullptr) - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_stream_destroy_wrapper(m_stream))); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchUnifiedCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; m_scratchUnified = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -624,30 +498,6 @@ void CudaInternal::finalize() { //---------------------------------------------------------------------------- -Cuda::size_type cuda_internal_multiprocessor_count() { - return CudaInternal::singleton().m_multiProcCount; -} - -CudaSpace::size_type 
cuda_internal_maximum_concurrent_block_count() { -#if defined(KOKKOS_ARCH_KEPLER) - // Compute capability 3.0 through 3.7 - enum : int { max_resident_blocks_per_multiprocessor = 16 }; -#else - // Compute capability 5.0 through 6.2 - enum : int { max_resident_blocks_per_multiprocessor = 32 }; -#endif - return CudaInternal::singleton().m_multiProcCount * - max_resident_blocks_per_multiprocessor; -}; - -Cuda::size_type cuda_internal_maximum_warp_count() { - return CudaInternal::singleton().m_maxWarpCount; -} - -std::array cuda_internal_maximum_grid_count() { - return CudaInternal::singleton().m_maxBlock; -} - Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance, const std::size_t size) { return instance.impl_internal_space_instance()->scratch_space(size); @@ -670,10 +520,6 @@ Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance, namespace Kokkos { -Cuda::size_type Cuda::detect_device_count() { - return Impl::CudaInternalDevices::singleton().m_cudaDevCount; -} - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int Cuda::concurrency() { #else @@ -687,25 +533,23 @@ int Cuda::impl_is_initialized() { } void Cuda::impl_initialize(InitializationSettings const &settings) { - const int cuda_device_id = Impl::get_gpu(settings); - const auto &dev_info = Impl::CudaInternalDevices::singleton(); - - const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; + const std::vector &visible_devices = Impl::get_visible_devices(); + const int cuda_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); - Impl::CudaInternal::m_cudaDev = cuda_device_id; + cudaDeviceProp cudaProp; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGetDeviceProperties(&cudaProp, cuda_device_id)); Impl::CudaInternal::m_deviceProp = cudaProp; - - Kokkos::Impl::cuda_device_synchronize( - "Kokkos::CudaInternal::initialize: Fence on space initialization"); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // Query 
what compute capability architecture a kernel executes: - Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(); + Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(cuda_device_id); if (Impl::CudaInternal::m_cudaArch == 0) { - std::stringstream ss; - ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; - std::string msg = ss.str(); - Kokkos::abort(msg.c_str()); + Kokkos::abort( + "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"); } int compiled_major = Impl::CudaInternal::m_cudaArch / 100; @@ -761,77 +605,41 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default } #endif - //---------------------------------- - // number of multiprocessors - Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount; - - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. - Impl::CudaInternal::m_maxWarpCount = - cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize; - - if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) { - Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize; - } - - //---------------------------------- - // Maximum number of blocks: - - Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0]; - Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1]; - Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2]; - - Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor; - Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock; - Impl::CudaInternal::m_maxBlocksPerSM = - Impl::CudaInternal::m_cudaArch < 500 - ? 16 - : (Impl::CudaInternal::m_cudaArch < 750 - ? 32 - : (Impl::CudaInternal::m_cudaArch == 750 ? 
16 : 32)); - Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor; - Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock; - //---------------------------------- cudaStream_t singleton_stream; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_create_wrapper( - &singleton_stream))); - - auto &cuda_singleton = Impl::CudaInternal::singleton(); - cuda_singleton.initialize(singleton_stream, /*manage*/ true); -} + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); -std::vector Cuda::detect_device_arch() { - const Impl::CudaInternalDevices &s = Impl::CudaInternalDevices::singleton(); - - std::vector output(s.m_cudaDevCount); - - for (int i = 0; i < s.m_cudaDevCount; ++i) { - output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor; - } + // Init the array for used for arbitrarily sized atomics + desul::Impl::init_lock_arrays(); // FIXME - return output; + Impl::CudaInternal::singleton().initialize(singleton_stream); } -Cuda::size_type Cuda::device_arch() { - const int dev_id = Impl::CudaInternal::singleton().m_cudaDev; +void Cuda::impl_finalize() { + (void)Impl::cuda_global_unique_token_locks(true); + desul::Impl::finalize_lock_arrays(); // FIXME - int dev_arch = 0; - - if (0 <= dev_id) { - const struct cudaDeviceProp &cudaProp = - Impl::CudaInternalDevices::singleton().m_cudaProp[dev_id]; - - dev_arch = cudaProp.major * 100 + cudaProp.minor; + for (const auto cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice + [cuda_device])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy( + Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device])); } - return dev_arch; -} + auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if 
(deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::cuda_get_deep_copy_stream())); -void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } + Impl::CudaInternal::singleton().finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::CudaInternal::singleton().m_stream)); +} Cuda::Cuda() : m_space_instance(&Impl::CudaInternal::singleton(), @@ -845,13 +653,17 @@ KOKKOS_DEPRECATED Cuda::Cuda(cudaStream_t stream, bool manage_stream) manage_stream ? Impl::ManageStream::yes : Impl::ManageStream::no) {} Cuda::Cuda(cudaStream_t stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::CudaInternal, [manage_stream](Impl::CudaInternal *ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index a324adecfeb0..24f4af310190 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -22,6 +22,10 @@ #include #include #include +#include "Kokkos_CudaSpace.hpp" + +#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -55,27 +59,10 @@ struct CudaTraits { unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; static constexpr int ConstantMemoryUseThreshold = 
0x000200 /* 512 bytes */; - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( - CudaSpace::size_type i) { - return (i + WarpIndexMask) >> WarpIndexShift; - } - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align( - CudaSpace::size_type i) { - constexpr CudaSpace::size_type Mask = ~WarpIndexMask; - return (i + WarpIndexMask) & Mask; - } }; //---------------------------------------------------------------------------- -CudaSpace::size_type cuda_internal_multiprocessor_count(); -CudaSpace::size_type cuda_internal_maximum_warp_count(); -std::array cuda_internal_maximum_grid_count(); - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count(); - CudaSpace::size_type* cuda_internal_scratch_flags(const Cuda&, const std::size_t size); CudaSpace::size_type* cuda_internal_scratch_space(const Cuda&, @@ -101,18 +88,10 @@ class CudaInternal { public: using size_type = Cuda::size_type; - inline static int m_cudaDev = -1; + int m_cudaDev = -1; // Device Properties - inline static int m_cudaArch = -1; - inline static unsigned m_multiProcCount = 0; - inline static unsigned m_maxWarpCount = 0; - inline static std::array m_maxBlock = {0, 0, 0}; - inline static int m_shmemPerSM = 0; - inline static int m_maxShmemPerBlock = 0; - inline static int m_maxBlocksPerSM = 0; - inline static int m_maxThreadsPerSM = 0; - inline static int m_maxThreadsPerBlock = 0; + inline static int m_cudaArch = -1; static int concurrency(); inline static cudaDeviceProp m_deviceProp; @@ -129,7 +108,6 @@ class CudaInternal { mutable size_type* m_scratchFunctor; cudaStream_t m_stream; uint32_t m_instance_id; - bool m_manage_stream; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -142,11 +120,11 @@ class CudaInternal { bool was_initialized = false; bool was_finalized = false; - // FIXME_CUDA: these want to be per-device, not per-stream... 
use of 'static' - // here will break once there are multiple devices though - inline static unsigned long* constantMemHostStaging = nullptr; - inline static cudaEvent_t constantMemReusable = nullptr; - inline static std::mutex constantMemMutex; + inline static std::set cuda_devices = {}; + inline static std::map constantMemHostStagingPerDevice = + {}; + inline static std::map constantMemReusablePerDevice = {}; + inline static std::map constantMemMutexPerDevice = {}; static CudaInternal& singleton(); @@ -156,7 +134,7 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(cudaStream_t stream, bool manage_stream); + void initialize(cudaStream_t stream); void finalize(); void print_configuration(std::ostream&) const; @@ -247,12 +225,6 @@ class CudaInternal { return cudaDeviceSetLimit(limit, value); } - template - cudaError_t cuda_device_synchronize_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaDeviceSynchronize(); - } - template cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const { if constexpr (setCudaDevice) set_cuda_device(); @@ -290,37 +262,6 @@ class CudaInternal { return cudaFreeHost(ptr); } - template - cudaError_t cuda_get_device_count_wrapper(int* count) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceCount(count); - } - - template - cudaError_t cuda_get_device_properties_wrapper(cudaDeviceProp* prop, - int device) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceProperties(prop, device); - } - - template - const char* cuda_get_error_name_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorName(error); - } - - template - const char* cuda_get_error_string_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorString(error); - } - - template - cudaError_t cuda_get_last_error_wrapper() const { - if 
constexpr (setCudaDevice) set_cuda_device(); - return cudaGetLastError(); - } - template cudaError_t cuda_graph_add_dependencies_wrapper( cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, @@ -506,10 +447,10 @@ class CudaInternal { } template - cudaError_t cuda_func_set_attributes_wrapper(T* entry, cudaFuncAttribute attr, - int value) const { + cudaError_t cuda_func_set_attribute_wrapper(T* entry, cudaFuncAttribute attr, + int value) const { if constexpr (setCudaDevice) set_cuda_device(); - return cudaFuncSetAttributes(entry, attr, value); + return cudaFuncSetAttribute(entry, attr, value); } template diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 82a72b690218..b0dadb45f72b 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -21,7 +21,6 @@ #ifdef KOKKOS_ENABLE_CUDA #include -#include #include #include #include @@ -118,42 +117,43 @@ inline bool is_empty_launch(dim3 const& grid, dim3 const& block) { } inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { + int const maxShmemPerBlock = cuda_instance->m_deviceProp.sharedMemPerBlock; + if (maxShmemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( - std::string("CudaParallelLaunch (or graph node creation) FAILED: shared" - " memory request is too large")); + "CudaParallelLaunch (or graph node creation) FAILED: shared memory " + "request is too large"); } } // These functions need to be templated on DriverType and LaunchBounds // so that the static bool is unique for each type combo // KernelFuncPtr does not necessarily contain that type information. 
-// FIXME_CUDA_MULTIPLE_DEVICES template const cudaFuncAttributes& get_cuda_kernel_func_attributes( - const KernelFuncPtr& func) { + int cuda_device, const KernelFuncPtr& func) { // Only call cudaFuncGetAttributes once for each unique kernel // by leveraging static variable initialization rules - auto wrap_get_attributes = [&]() -> cudaFuncAttributes { + static std::map func_attr; + if (func_attr.find(cuda_device) == func_attr.end()) { cudaFuncAttributes attr; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr, - func))); - return attr; - }; - static cudaFuncAttributes func_attr = wrap_get_attributes(); - return func_attr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); + func_attr.emplace(cuda_device, attr); + } + return func_attr[cuda_device]; } template -inline void configure_shmem_preference(const KernelFuncPtr& func, +inline void configure_shmem_preference(const int cuda_device, + const KernelFuncPtr& func, const cudaDeviceProp& device_props, const size_t block_size, int& shmem, const size_t occupancy) { #ifndef KOKKOS_ARCH_KEPLER const auto& func_attr = - get_cuda_kernel_func_attributes(func); + get_cuda_kernel_func_attributes(cuda_device, + func); // Compute limits for number of blocks due to registers/SM const size_t regs_per_sm = device_props.regsPerMultiprocessor; @@ -222,7 +222,7 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, // FIXME_CUDA_MULTIPLE_DEVICES auto set_cache_config = [&] { KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_set_attributes_wrapper( + (CudaInternal::singleton().cuda_func_set_attribute_wrapper( func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout))); return carveout; }; @@ -387,8 +387,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; 
Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } void const* args[] = {&driver}; @@ -487,8 +487,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); @@ -576,13 +576,16 @@ struct CudaParallelLaunchKernelInvoker< static void invoke_kernel(DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, CudaInternal const* cuda_instance) { + int cuda_device = cuda_instance->m_cudaDev; // Wait until the previous kernel that uses the constant buffer is done - std::lock_guard lock(CudaInternal::constantMemMutex); + std::lock_guard lock( + CudaInternal::constantMemMutexPerDevice[cuda_device]); KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; + unsigned long* staging = + cuda_instance->constantMemHostStagingPerDevice[cuda_device]; memcpy(staging, &driver, sizeof(DriverType)); // Copy functor asynchronously from there to constant memory on the device @@ -597,7 +600,7 @@ struct CudaParallelLaunchKernelInvoker< // Record an event that says when the constant buffer can be reused KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper( - 
CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); } inline static void create_parallel_launch_graph_node( @@ -665,8 +668,8 @@ struct CudaParallelLaunchImpl< Impl::configure_shmem_preference< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } desul::ensure_cuda_lock_arrays_on_device(); @@ -675,18 +678,17 @@ struct CudaParallelLaunchImpl< base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_instance->cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); cuda_instance->fence( "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error"); #endif } } - static cudaFuncAttributes get_cuda_func_attributes() { + static cudaFuncAttributes get_cuda_func_attributes(int cuda_device) { return get_cuda_kernel_func_attributes< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func()); + cuda_device, base_t::get_kernel_func()); } }; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp index 7492ab49e56b..2c7eba7a18ff 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -40,8 +40,8 @@ template <> inline TileSizeProperties get_tile_size_properties( const Kokkos::Cuda& space) { TileSizeProperties properties; - properties.max_threads = - space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.max_threads = space.impl_internal_space_instance() + ->m_deviceProp.maxThreadsPerMultiProcessor; properties.default_largest_tile_size = 16; properties.default_tile_size = 2; properties.max_total_tile_size = 512; diff --git 
a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 49d6c112e370..630389840048 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -42,8 +41,8 @@ namespace Impl { template int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + pol.space().cuda_device()); auto const& prop = pol.space().cuda_device_prop(); // Limits due to registers/SM, MDRange doesn't have @@ -96,7 +95,7 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = cuda_internal_maximum_grid_count(); + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); KOKKOS_ASSERT(block.x > 0); @@ -325,19 +324,18 @@ class ParallelReduce( f, n); using closure_type = Impl::ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 34729992812b..0f052be3c307 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include 
#include @@ -86,18 +85,18 @@ class ParallelFor, Kokkos::Cuda> { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + m_policy.space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( m_policy.space().impl_internal_space_instance(), attr, m_functor, 1, 0, 0); KOKKOS_ASSERT(block_size > 0); dim3 block(1, block_size, 1); + const int maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0]; dim3 grid( - std::min( - typename Policy::index_type((nwork + block.y - 1) / block.y), - typename Policy::index_type(cuda_internal_maximum_grid_count()[0])), + std::min(typename Policy::index_type((nwork + block.y - 1) / block.y), + typename Policy::index_type(maxGridSizeX)), 1, 1); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) { @@ -244,10 +243,10 @@ class ParallelReduce, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); } else if (word_count.value > 1) { - // Inside cuda_single_inter_block_reduce_scan() above, shared[i] below - // might have been updated by a single thread within a warp without - // synchronization afterwards. Synchronize threads within warp to avoid - // potential racecondition. + // Inside cuda_single_inter_block_reduce_scan() and final() above, + // shared[i] below might have been updated by a single thread within a + // warp without synchronization afterwards. Synchronize threads within + // warp to avoid potential race condition. 
__syncwarp(0xffffffff); } @@ -260,19 +259,18 @@ class ParallelReduce, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { unsigned n = CudaTraits::WarpSize * 8; + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; int shmem_size = cuda_single_inter_block_reduce_scan_shmem( f, n); using closure_type = Impl::ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( @@ -615,11 +613,11 @@ class ParallelScan, Kokkos::Cuda> { // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; @@ -939,11 +937,11 @@ class ParallelScanWithTotal, // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index b4679b4e0da7..9f7be45c839b 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ 
b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include @@ -98,7 +98,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); int block_size = Kokkos::Impl::cuda_get_max_block_size( @@ -137,7 +137,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( @@ -262,7 +262,8 @@ class TeamPolicyInternal m_tune_team(bool(team_size_request <= 0)), m_tune_vector(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0])) + const int maxGridSizeX = m_space.cuda_device_prop().maxGridSize[0]; + if (league_size_ >= maxGridSizeX) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution " "space."); @@ -369,7 +370,7 @@ class TeamPolicyInternal cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = std::forward(block_size_callable)( space().impl_internal_space_instance(), attr, f, (size_t)impl_vector_length(), @@ -539,8 +540,8 @@ class ParallelFor, auto internal_space_instance = m_policy.space().impl_internal_space_instance(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? 
m_team_size @@ -575,10 +576,11 @@ class ParallelFor, static_cast(m_league_size)))); } + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; const int shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - printf("%i %i\n", internal_space_instance->m_maxShmemPerBlock, - shmem_size_total); + if (maxShmemPerBlock < shmem_size_total) { + printf("%i %i\n", maxShmemPerBlock, shmem_size_total); Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } @@ -623,6 +625,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::Cuda::size_type), + std::conditional_t, + Kokkos::Cuda::size_type>; using size_type = Cuda::size_type; using reducer_type = ReducerType; @@ -646,9 +664,11 @@ class ParallelReduce + const integral_nonzero_constant word_count(m_functor_reducer.get_reducer().value_size() / - sizeof(size_type)); + sizeof(word_size_type)); reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + + kokkos_impl_cuda_shared_memory() + threadIdx.y * word_count.value); // Iterate this block through the league @@ -721,18 +742,19 @@ class ParallelReduce( m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory(), m_scratch_space, + kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags); if (do_final_reduction) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; 
+ word_size_type* const shared = + kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) + ? reinterpret_cast(m_result_ptr) : (m_unified_space ? m_unified_space : m_scratch_space); if (threadIdx.y == 0) { @@ -787,7 +809,8 @@ class ParallelReduce(m_scratch_space), result, + m_scratch_flags, blockDim.y)) { const unsigned id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { m_functor_reducer.get_reducer().final(&value); @@ -808,13 +831,15 @@ class ParallelReduce(cuda_internal_scratch_space( + m_policy.space(), + m_functor_reducer.get_reducer().value_size() * block_count)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), m_functor_reducer.get_reducer().value_size()); + m_unified_space = + reinterpret_cast(cuda_internal_scratch_unified( + m_policy.space(), m_functor_reducer.get_reducer().value_size())); dim3 block(m_vector_size, m_team_size, 1); dim3 grid(block_count, 1, 1); @@ -847,7 +872,8 @@ class ParallelReduce(m_result_ptr, m_scratch_space, size); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); } } } @@ -883,9 +909,8 @@ class ParallelReduce::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? 
m_team_size @@ -940,6 +965,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (maxShmemPerBlock < shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much " "L0 scratch memory")); diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 7ccedbfe28da..3037c4ab5414 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -103,7 +103,7 @@ template __device__ bool cuda_inter_block_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, const FunctorType& reducer, - Cuda::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, Cuda::size_type* const m_scratch_flags, const int max_active_thread = blockDim.y) { @@ -117,7 +117,7 @@ __device__ bool cuda_inter_block_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x; + pointer_type global = m_scratch_space + blockIdx.x; *global = value; } @@ -140,7 +140,7 @@ __device__ bool cuda_inter_block_reduction( last_block = true; value = neutral; - pointer_type const volatile global = (pointer_type)m_scratch_space; + pointer_type const volatile global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = @@ -702,8 +702,7 @@ inline void check_reduced_view_shmem_size(const Policy& policy, unsigned reqShmemSize = cuda_single_inter_block_reduce_scan_shmem( functor, minBlockSize); - size_t maxShmemPerBlock = - policy.space().impl_internal_space_instance()->m_maxShmemPerBlock; + size_t maxShmemPerBlock = 
policy.space().cuda_device_prop().sharedMemPerBlock; if (reqShmemSize > maxShmemPerBlock) { Kokkos::Impl::throw_runtime_exception( diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index baff7ef3f553..86d6d91bbee1 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -84,8 +84,8 @@ class TaskQueueSpecialization> { KOKKOS_INLINE_FUNCTION static void iff_single_thread_recursive_execute(scheduler_type const&) {} - static int get_max_team_count(execution_space const&) { - return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block; + static int get_max_team_count(execution_space const& space) { + return space.cuda_device_prop().multiProcessorCount * warps_per_block; } __device__ static void driver(scheduler_type scheduler, @@ -225,7 +225,11 @@ class TaskQueueSpecialization> { // FIXME_CUDA_MULTIPLE_DEVICES static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda& exec = scheduler.get_execution_space(); + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + exec.cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; const cudaStream_t stream = nullptr; @@ -245,34 +249,30 @@ class TaskQueueSpecialization> { // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in 
bytes: const size_t larger_stack_size = 1 << 11; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization::execute: Post Task Execution"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -300,8 +300,8 @@ class TaskQueueSpecialization> { set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization::execute: Post Get Function Pointer for Tasks"); @@ -466,7 +466,13 @@ class TaskQueueSpecializationConstrained< static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda exec = Cuda(); // FIXME_CUDA_MULTIPLE_DEVICES + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + // FIXME not sure why this didn't work + // exec.cuda_device_prop().multiProcessorCount; + impl_instance->m_deviceProp.multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); // const dim3 grid( 1 , 1 , 1 
); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; @@ -482,34 +488,30 @@ class TaskQueueSpecializationConstrained< // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 2048; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: Post Execute Task"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -532,8 +534,7 @@ class TaskQueueSpecializationConstrained< set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained::get_function_pointer: Post Get Function Pointer"); diff --git 
a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index abb747e39a10..94a428493f47 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -22,7 +22,6 @@ #include #include -#include namespace Kokkos { diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index a945a716bc33..c7ea6988a5d0 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -77,7 +77,9 @@ class ParallelFor, inline void execute() { const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const int multi_processor_count = + m_policy.space().cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared = 0; diff --git a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index c7f0d12d914b..517c592af724 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -25,23 +25,14 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, + const View& dst) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() ->cuda_memset_async_wrapper( dst.data(), 0, dst.size() * sizeof(typename View::value_type)))); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - // FIXME_CUDA_MULTIPLE_DEVICES - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Kokkos::Impl::CudaInternal::singleton().cuda_memset_wrapper( - 
dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); - } }; } // namespace Impl diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp index f78bfd28b2f2..309e07fb3fbb 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP.cpp @@ -18,6 +18,7 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif +#include #include #include @@ -41,7 +42,9 @@ int HIP::impl_is_initialized() { } void HIP::impl_initialize(InitializationSettings const& settings) { - const int hip_device_id = Impl::get_gpu(settings); + const std::vector& visible_devices = Impl::get_visible_devices(); + const int hip_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( @@ -89,10 +92,23 @@ void HIP::impl_initialize(InitializationSettings const& settings) { hipStream_t singleton_stream; KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&singleton_stream)); - Impl::HIPInternal::singleton().initialize(singleton_stream, /*manage*/ true); + Impl::HIPInternal::singleton().initialize(singleton_stream); } -void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); } +void HIP::impl_finalize() { + (void)Impl::hip_global_unique_token_locks(true); + + desul::Impl::finalize_lock_arrays(); // FIXME + + KOKKOS_IMPL_HIP_SAFE_CALL( + hipEventDestroy(Impl::HIPInternal::constantMemReusable)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipHostFree(Impl::HIPInternal::constantMemHostStaging)); + + Impl::HIPInternal::singleton().finalize(); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipStreamDestroy(Impl::HIPInternal::singleton().m_stream)); +} HIP::HIP() : m_space_instance(&Impl::HIPInternal::singleton(), @@ -102,13 +118,17 @@ HIP::HIP() } HIP::HIP(hipStream_t const stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new 
Impl::HIPInternal, [manage_stream](Impl::HIPInternal* ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } KOKKOS_DEPRECATED HIP::HIP(hipStream_t const stream, bool manage_stream) diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp index 61ed346b2182..3a88e97ee3dd 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP.hpp @@ -57,13 +57,15 @@ class HIP { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__HIP_DEVICE_COMPILE__) return true; #else return false; #endif } +#endif /** \brief Wait until all dispatched functors complete. 
* @@ -94,9 +96,13 @@ class HIP { static int impl_is_initialized(); - // static size_type device_arch(); - - static size_type detect_device_count(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 576c53426bca..5f0df72df179 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -20,13 +20,11 @@ #include #include -#include #include #include #include -#include #include namespace Kokkos { @@ -43,7 +41,6 @@ class GraphNodeKernelImpl using base_t = typename PatternImplSpecializationFromTag::type; - using Record = Kokkos::Impl::SharedAllocationRecord; // TODO use the name and executionspace template @@ -60,7 +57,7 @@ class GraphNodeKernelImpl ~GraphNodeKernelImpl() { if (m_driver_storage) { - Record::decrement(Record::get_record(m_driver_storage)); + Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); } } @@ -78,15 +75,9 @@ class GraphNodeKernelImpl Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - - auto* record = Record::allocate( - Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; } diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 7f04eb721cb4..22c0db047f61 100644 
--- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include /*--------------------------------------------------------------------------*/ @@ -89,10 +90,14 @@ void HIPInternal::print_configuration(std::ostream &s) const { << '\n'; #endif - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); + s << "macro KOKKOS_ENABLE_ROCTHRUST : " +#if defined(KOKKOS_ENABLE_ROCTHRUST) + << "defined\n"; +#else + << "undefined\n"; +#endif - for (int i = 0; i < hipDevCount; ++i) { + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); std::string gpu_type = hipProp.integrated == 1 ? "APU" : "dGPU"; @@ -159,14 +164,13 @@ void HIPInternal::fence(const std::string &name) const { [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); }); } -void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { +void HIPInternal::initialize(hipStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); - m_stream = stream; - m_manage_stream = manage_stream; + m_stream = stream; //---------------------------------- // Multiblock reduction uses scratch flags for counters @@ -192,20 +196,19 @@ void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = 
scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -214,21 +217,23 @@ Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. 
KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size)); } @@ -238,29 +243,20 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution( void const *driver, std::size_t const size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; + Kokkos::HIPHostPinnedSpace host_mem_space; if (m_scratchFunctor) { - Record::decrement(Record::get_record(m_scratchFunctor)); - RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } - Record *const r = - Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - RecordHost *const r_host = RecordHost::allocate( - Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", - m_scratchFunctorSize); - - Record::increment(r); - RecordHost::increment(r_host); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast(r->data()); - m_scratchFunctorHost = reinterpret_cast(r_host->data()); + m_scratchFunctor = static_cast(device_mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); + m_scratchFunctorHost = static_cast(host_mem_space.allocate( + "Kokkos::InternalScratchFunctorHost", m_scratchFunctorSize)); } // When using HSA_XNACK=1, it is necessary to copy the driver to the host to @@ -323,23 +319,18 @@ void HIPInternal::finalize() { this->fence("Kokkos::HIPInternal::finalize: fence on finalization"); was_finalized = true; - if (this == &singleton()) { - (void)Kokkos::Impl::hip_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - 
KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); - } - if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordHIP = Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; - RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchSpaceCount * sizeScratchGrain); + device_mem_space.deallocate(m_scratchSpace, + m_scratchFlagsCount * sizeScratchGrain); if (m_scratchFunctorSize > 0) { - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + Kokkos::HIPHostPinnedSpace host_mem_space; + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } } @@ -348,14 +339,10 @@ void HIPInternal::finalize() { Kokkos::kokkos_free(m_team_scratch_ptr[i]); } - if (m_manage_stream && m_stream != nullptr) - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream)); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -419,13 +406,3 @@ void Kokkos::Impl::create_HIP_instances(std::vector &instances) { instances[s] = HIP(stream, ManageStream::yes); } } - -//---------------------------------------------------------------------------- - -namespace Kokkos { -HIP::size_type HIP::detect_device_count() { - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); - return hipDevCount; -} -} // namespace Kokkos diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 63ad66686bb8..142008124af9 100644 --- 
a/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -98,7 +98,6 @@ class HIPInternal { uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); - bool m_manage_stream = false; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -124,7 +123,7 @@ class HIPInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(hipStream_t stream, bool manage_stream); + void initialize(hipStream_t stream); void finalize(); void print_configuration(std::ostream &) const; diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp new file mode 100644 index 000000000000..db07c360b5cb --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp @@ -0,0 +1,173 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +// ParallelFor +template +class ParallelFor, HIP> { + public: + using Policy = Kokkos::MDRangePolicy; + using functor_type = FunctorType; + + private: + using array_index_type = typename Policy::array_index_type; + using index_type = typename Policy::index_type; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + Kokkos::Impl::DeviceIterateTile(m_policy, + m_functor) + .exec_range(); + } + + inline void execute() const { + using ClosureType = ParallelFor; + if (m_policy.m_num_tiles == 0) return; + auto const maxblocks = hip_internal_maximum_grid_count(); + if (Policy::rank == 2) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + 1); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 3) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], + m_policy.m_tile[2]); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + 
m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 4) { + // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to + // threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2], m_policy.m_tile[3]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 5) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 + // to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 6) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; + // id4,id5 to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4] * m_policy.m_tile[5]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else { + 
Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); + } + + } // end execute + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + + template + static int max_tile_size_product(const Policy&, const Functor&) { + using closure_type = + ParallelFor, HIP>; + unsigned block_size = hip_get_max_blocksize(); + if (block_size == 0) + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "tile size.")); + return block_size; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp new file mode 100644 index 000000000000..9355c1c75fbe --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -0,0 +1,100 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP + +#include + +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(i); + } + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(TagType(), i); + } + + public: + using functor_type = FunctorType; + + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + const Member work_stride = blockDim.y * gridDim.x; + const Member work_end = m_policy.end(); + + for (Member iwork = + m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; + iwork < work_end; + iwork = iwork < work_end - work_stride ? 
iwork + work_stride + : work_end) { + this->template exec_range(iwork); + } + } + + inline void execute() const { + const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + + using DriverType = ParallelFor; + const int block_size = + Kokkos::Impl::hip_get_preferred_blocksize(); + const dim3 block(1, block_size, 1); + const dim3 grid( + typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " + "valid execution configuration.")); + } + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), + false); + } + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp new file mode 100644 index 000000000000..bf0c2193383f --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -0,0 +1,177 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, HIP> { + public: + using Policy = TeamPolicy; + using functor_type = FunctorType; + using size_type = HIP::size_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ team reduce space ] + // [ team shared space ] + + FunctorType const m_functor; + Policy const m_policy; + size_type const m_league_size; + int m_team_size; + size_type const m_vector_size; + int m_shmem_begin; + int m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(member); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(TagType(), member); + } + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + __device__ inline void operator()() const { + // Iterate this block through the league + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team(typename Policy::member_type( + kokkos_impl_hip_shared_memory(), 
m_shmem_begin, m_shmem_size, + static_cast(static_cast(m_scratch_ptr[1]) + + ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size)); + } + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + inline void execute() const { + int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; + dim3 const grid(static_cast(m_league_size), 1, 1); + dim3 const block(static_cast(m_vector_size), + static_cast(m_team_size), 1); + + using closure_type = + ParallelFor, HIP>; + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + } + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); + + m_shmem_begin = (sizeof(double) * (m_team_size + 2)); + m_shmem_size = + (m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value(m_functor, m_team_size)); + m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. 
+ m_scratch_ptr[0] = nullptr; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); + } + + size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); + } + } + + ~ParallelFor() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp similarity index 61% rename from packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp rename to packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp index 0fa325cb12c7..55b6218d1c88 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp @@ -14,157 +14,19 @@ // //@HEADER -#ifndef KOKKOS_HIP_PARALLEL_MDRANGE_HPP -#define KOKKOS_HIP_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP + +#include #include #include #include #include #include -#include namespace Kokkos { namespace Impl { -// 
ParallelFor -template -class ParallelFor, HIP> { - public: - using Policy = Kokkos::MDRangePolicy; - using functor_type = FunctorType; - - private: - using array_index_type = typename Policy::array_index_type; - using index_type = typename Policy::index_type; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - Kokkos::Impl::DeviceIterateTile(m_policy, - m_functor) - .exec_range(); - } - - inline void execute() const { - using ClosureType = ParallelFor; - if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); - if (Policy::rank == 2) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - 1); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 3) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], - m_policy.m_tile[2]); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2], 
m_policy.m_tile[3]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 - // to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; - // id4,id5 to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4] * m_policy.m_tile[5]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); - } - - } // end execute - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - - template - static int max_tile_size_product(const Policy&, 
const Functor&) { - using closure_type = - ParallelFor, HIP>; - unsigned block_size = hip_get_max_blocksize(); - if (block_size == 0) - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " - "tile size.")); - return block_size; - } -}; // ParallelReduce template diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp new file mode 100644 index 000000000000..c8981866e8ac --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp @@ -0,0 +1,329 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, + Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using LaunchBounds = typename Policy::launch_bounds; + + public: + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; + using functor_type = FunctorType; + using reducer_type = ReducerType; + using size_type = Kokkos::HIP::size_type; + using index_type = typename Policy::index_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::HIP::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
+ using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t, size_type>; + + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == + // blockDim.z == 1 + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + word_size_type* m_scratch_space = nullptr; + size_type* m_scratch_flags = nullptr; + + static constexpr bool UseShflReduction = false; + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Make the exec_range calls call to Reduce::DeviceIterateTile + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(i, update); + } + + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), i, update); + } + + public: + __device__ inline void operator()() const { + using ReductionTag = std::conditional_t; + run(ReductionTag{}); + } + + __device__ inline void run(SHMEMReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + const integral_nonzero_constant + word_count(reducer.value_size() / sizeof(word_size_type)); + + { + reference_type value = reducer.init(reinterpret_cast( + ::Kokkos::kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); + + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of + // work to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically + // equivalent. 
+ + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + } + + // Reduce with final value at blockDim.y - 1 location. + // Shortcut for length zero reduction + bool do_final_reduction = m_policy.begin() == m_policy.end(); + if (!do_final_reduction) + do_final_reduction = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + ::Kokkos::kokkos_impl_hip_shared_memory(), + m_scratch_space, m_scratch_flags); + if (do_final_reduction) { + // This is the final block with the final result at the final threads' + // location + + word_size_type* const shared = + ::Kokkos::kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + word_size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of work + // to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically equivalent. 
+ + WorkRange const range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + + pointer_type const result = reinterpret_cast(m_scratch_space); + + int max_active_thread = static_cast(range.end() - range.begin()) < + static_cast(blockDim.y) + ? range.end() - range.begin() + : blockDim.y; + + max_active_thread = + (max_active_thread == 0) ? blockDim.y : max_active_thread; + + value_type init; + reducer.init(&init); + if (m_policy.begin() == m_policy.end()) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? m_result_ptr : result; + *final_result = value; + } else if (Impl::hip_inter_block_shuffle_reduction<>( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, max_active_thread)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? 
m_result_ptr : result; + *final_result = value; + } + } + } + + // Determine block size constrained by shared memory: + inline unsigned local_block_size(const FunctorType& f) { + const auto& instance = m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem(f, n); + }; + return Kokkos::Impl::hip_get_preferred_blocksize( + instance, shmem_functor); + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const index_type nwork = m_policy.end() - m_policy.begin(); + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + !std::is_same::value; + if ((nwork > 0) || need_device_set) { + const int block_size = local_block_size(m_functor_reducer.get_functor()); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid execution configuration.")); + } + + // REQUIRED ( 1 , N , 1 ) + dim3 block(1, block_size, 1); + // use a slightly less constrained, but still well bounded limit for + // scratch + int nblocks = (nwork + block.y - 1) / block.y; + // Heuristic deciding the value of nblocks. + // The general idea here is we want to: + // 1. Not undersubscribe the device (i.e., we want at least + // preferred_block_min blocks) + // 2. Have each thread reduce > 1 value to minimize overheads + // 3. 
Limit the total # of blocks, to avoid unbounded scratch space + constexpr int block_max = 4096; + constexpr int preferred_block_min = 1024; + + if (nblocks < preferred_block_min) { + // keep blocks as is, already have low parallelism + } else if (nblocks > block_max) { + // "large dispatch" -> already have lots of parallelism + nblocks = block_max; + } else { + // in the intermediate range, try to have each thread process multiple + // items to offset the cost of the reduction (with not enough + // parallelism to hide it) + int items_per_thread = + (nwork + nblocks * block_size - 1) / (nblocks * block_size); + if (items_per_thread < 4) { + int ratio = std::min( + (nblocks + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + nblocks /= ratio; + } + } + + // TODO: down casting these uses more space than required? + m_scratch_space = + (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * nblocks); + // Intentionally do not downcast to word_size_type since we use HIP + // atomics in Kokkos_HIP_ReduceScan.hpp + m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( + m_policy.space(), sizeof(size_type)); + // Required grid.x <= block.y + dim3 grid(nblocks, 1, 1); + + if (nwork == 0) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem = + UseShflReduction + ? 
0 + : hip_single_inter_block_reduce_scan_shmem( + m_functor_reducer.get_functor(), block.y); + + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + + if (!m_result_ptr_device_accessible && m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp new file mode 100644 index 000000000000..609ba28b866d --- /dev/null +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -0,0 +1,394 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, HIP> { + public: + using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; + + public: + using functor_type = FunctorType; + using size_type = HIP::size_type; + + // static int constexpr UseShflReduction = false; + // FIXME_HIP This should be disabled unconditionally for best performance, but + // it currently causes tests to fail. 
+ static constexpr int UseShflReduction = + (ReducerType::static_value_size() != 0); + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ global reduce space ] + // [ team reduce space ] + // [ team shared space ] + // + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type m_team_begin; + size_type m_shmem_begin; + size_type m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + const size_type m_league_size; + int m_team_size; + const size_type m_vector_size; + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(member, update); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), member, update); + } + + __device__ inline void iterate_through_league(int const threadid, + reference_type value) const { + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team( + member_type( + kokkos_impl_hip_shared_memory() + m_team_begin, + m_shmem_begin, m_shmem_size, + reinterpret_cast( + reinterpret_cast(m_scratch_ptr[1]) + + static_cast(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size), + value); + } + } + + int compute_block_count() const { + constexpr auto light_weight = + 
Kokkos::Experimental::WorkItemProperty::HintLightWeight; + constexpr typename Policy::work_item_property property; + // Numbers were tuned on MI210 using dot product and yAx benchmarks + constexpr int block_max = + (property & light_weight) == light_weight ? 2097152 : 65536; + constexpr int preferred_block_min = 1024; + int block_count = m_league_size; + if (block_count < preferred_block_min) { + // keep blocks as is, already low parallelism + } else if (block_count >= block_max) { + block_count = block_max; + + } else { + int nwork = m_league_size * m_team_size; + int items_per_thread = + (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); + if (items_per_thread < 4) { + int ratio = std::min( + (block_count + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + block_count /= ratio; + } + } + + return block_count; + } + + public: + __device__ inline void operator()() const { + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + using ReductionTag = std::conditional_t; + run(ReductionTag{}, threadid); + + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + __device__ inline void run(SHMEMReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(size_type)); + + reference_type value = + reducer.init(kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value); + // Iterate this block through the league + iterate_through_league(threadid, value); + + // Reduce with final value at blockDim.y - 1 location. 
+ bool do_final_reduce = (m_league_size == 0); + if (!do_final_reduce) + do_final_reduce = + hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); + if (do_final_reduce) { + // This is the final block with the final result at the final threads' + // location + + size_type* const shared = kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + + // Iterate this block through the league + iterate_through_league(threadid, value); + + pointer_type const result = + m_result_ptr_device_accessible + ? 
m_result_ptr + : reinterpret_cast(m_scratch_space); + + value_type init; + reducer.init(&init); + if (m_league_size == 0) { + reducer.final(&value); + *result = value; + } else if (Impl::hip_inter_block_shuffle_reduction( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, blockDim.y)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + *result = value; + } + } + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const bool is_empty_range = m_league_size == 0 || m_team_size == 0; + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + Policy::is_graph_kernel::value || + !std::is_same::value; + if (!is_empty_range || need_device_set) { + int const block_count = compute_block_count(); + + m_scratch_space = hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count); + m_scratch_flags = + hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); + + dim3 block(m_vector_size, m_team_size, 1); + dim3 grid(block_count, 1, 1); + if (is_empty_range) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().impl_internal_space_instance()->fence(); + + if (m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : 
m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + m_team_begin(0), + m_shmem_begin(0), + m_shmem_size(0), + m_scratch_ptr{nullptr, nullptr}, + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), + arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + + m_team_begin = + UseShflReduction + ? 0 + : hip_single_inter_block_reduce_scan_shmem( + arg_functor_reducer.get_functor(), m_team_size); + m_shmem_begin = sizeof(double) * (m_team_size + 2); + m_shmem_size = m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), m_team_size); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + // The global parallel_reduce does not support vector_length other than 1 at + // the moment + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a vector 
length of " + "greater than 1 is not currently supported for HIP for dynamic " + "sized reduction types."); + + if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " + "than 64 is not currently supported with HIP for dynamic sized " + "reduction types."); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && + !UseShflReduction) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); + } + + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " + "L0 scratch memory")); + } + + size_t max_size = arg_policy.team_size_max( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " + "large team size.")); + } + } + + ~ParallelReduce() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp similarity index 50% rename from packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp rename to packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp index 26e8be4698a8..41692a3291be 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ 
b/packages/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp @@ -14,390 +14,18 @@ // //@HEADER -#ifndef KOKKO_HIP_PARALLEL_RANGE_HPP -#define KOKKO_HIP_PARALLEL_RANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP #include -#if defined(__HIPCC__) - #include #include #include -#include -#include namespace Kokkos { namespace Impl { -template -class ParallelFor, Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - - private: - using Member = typename Policy::member_type; - using WorkTag = typename Policy::work_tag; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(i); - } - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(TagType(), i); - } - - public: - using functor_type = FunctorType; - - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - const Member work_stride = blockDim.y * gridDim.x; - const Member work_end = m_policy.end(); - - for (Member iwork = - m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; - iwork < work_end; - iwork = iwork < work_end - work_stride ? 
iwork + work_stride - : work_end) { - this->template exec_range(iwork); - } - } - - inline void execute() const { - const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); - - using DriverType = ParallelFor; - const int block_size = - Kokkos::Impl::hip_get_preferred_blocksize(); - const dim3 block(1, block_size, 1); - const dim3 grid( - typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " - "valid execution configuration.")); - } - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), - false); - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, - Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using WorkRange = typename Policy::WorkRange; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using LaunchBounds = typename Policy::launch_bounds; - - public: - using pointer_type = typename ReducerType::pointer_type; - using value_type = typename ReducerType::value_type; - using reference_type = typename ReducerType::reference_type; - using functor_type = FunctorType; - using reducer_type = ReducerType; - using size_type = Kokkos::HIP::size_type; - using index_type = typename Policy::index_type; - // Conditionally set word_size_type to int16_t or int8_t if value_type is - // smaller than int32_t (Kokkos::HIP::size_type) - // word_size_type is used to determine 
the word count, shared memory buffer - // size, and global memory buffer size before the scan is performed. - // Within the scan, the word count is recomputed based on word_size_type - // and when calculating indexes into the shared/global memory buffers for - // performing the scan, word_size_type is used again. - // For scalars > 4 bytes in size, indexing into shared/global memory relies - // on the block and grid dimensions to ensure that we index at the correct - // offset rather than at every 4 byte word; such that, when the join is - // performed, we have the correct data that was copied over in chunks of 4 - // bytes. - using word_size_type = std::conditional_t< - sizeof(value_type) < sizeof(size_type), - std::conditional_t, size_type>; - - // Algorithmic constraints: blockSize is a power of two AND blockDim.y == - // blockDim.z == 1 - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - word_size_type* m_scratch_space = nullptr; - size_type* m_scratch_flags = nullptr; - - static constexpr bool UseShflReduction = false; - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Make the exec_range calls call to Reduce::DeviceIterateTile - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(i, update); - } - - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), i, update); - } - - public: - __device__ inline void operator()() const { - using ReductionTag = std::conditional_t; - run(ReductionTag{}); - } - - __device__ inline void run(SHMEMReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - const integral_nonzero_constant - 
word_count(reducer.value_size() / sizeof(word_size_type)); - - { - reference_type value = reducer.init(reinterpret_cast( - ::Kokkos::kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value)); - - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of - // work to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically - // equivalent. - - const WorkRange range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - } - - // Reduce with final value at blockDim.y - 1 location. - // Shortcut for length zero reduction - bool do_final_reduction = m_policy.begin() == m_policy.end(); - if (!do_final_reduction) - do_final_reduction = hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - ::Kokkos::kokkos_impl_hip_shared_memory(), - m_scratch_space, m_scratch_flags); - if (do_final_reduction) { - // This is the final block with the final result at the final threads' - // location - - word_size_type* const shared = - ::Kokkos::kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - word_size_type* const global = - m_result_ptr_device_accessible - ? 
reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of work - // to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically equivalent. - - WorkRange const range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - - pointer_type const result = reinterpret_cast(m_scratch_space); - - int max_active_thread = static_cast(range.end() - range.begin()) < - static_cast(blockDim.y) - ? range.end() - range.begin() - : blockDim.y; - - max_active_thread = - (max_active_thread == 0) ? blockDim.y : max_active_thread; - - value_type init; - reducer.init(&init); - if (m_policy.begin() == m_policy.end()) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? m_result_ptr : result; - *final_result = value; - } else if (Impl::hip_inter_block_shuffle_reduction<>( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, max_active_thread)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? 
m_result_ptr : result; - *final_result = value; - } - } - } - - // Determine block size constrained by shared memory: - inline unsigned local_block_size(const FunctorType& f) { - const auto& instance = m_policy.space().impl_internal_space_instance(); - auto shmem_functor = [&f](unsigned n) { - return hip_single_inter_block_reduce_scan_shmem(f, n); - }; - return Kokkos::Impl::hip_get_preferred_blocksize( - instance, shmem_functor); - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const index_type nwork = m_policy.end() - m_policy.begin(); - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - !std::is_same::value; - if ((nwork > 0) || need_device_set) { - const int block_size = local_block_size(m_functor_reducer.get_functor()); - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " - "valid execution configuration.")); - } - - // REQUIRED ( 1 , N , 1 ) - dim3 block(1, block_size, 1); - // use a slightly less constrained, but still well bounded limit for - // scratch - int nblocks = (nwork + block.y - 1) / block.y; - // Heuristic deciding the value of nblocks. - // The general idea here is we want to: - // 1. Not undersubscribe the device (i.e., we want at least - // preferred_block_min blocks) - // 2. Have each thread reduce > 1 value to minimize overheads - // 3. 
Limit the total # of blocks, to avoid unbounded scratch space - constexpr int block_max = 4096; - constexpr int preferred_block_min = 1024; - - if (nblocks < preferred_block_min) { - // keep blocks as is, already have low parallelism - } else if (nblocks > block_max) { - // "large dispatch" -> already have lots of parallelism - nblocks = block_max; - } else { - // in the intermediate range, try to have each thread process multiple - // items to offset the cost of the reduction (with not enough - // parallelism to hide it) - int items_per_thread = - (nwork + nblocks * block_size - 1) / (nblocks * block_size); - if (items_per_thread < 4) { - int ratio = std::min( - (nblocks + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - nblocks /= ratio; - } - } - - // TODO: down casting these uses more space than required? - m_scratch_space = - (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * nblocks); - // Intentionally do not downcast to word_size_type since we use HIP - // atomics in Kokkos_HIP_ReduceScan.hpp - m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( - m_policy.space(), sizeof(size_type)); - // Required grid.x <= block.y - dim3 grid(nblocks, 1, 1); - - if (nwork == 0) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem = - UseShflReduction - ? 
0 - : hip_single_inter_block_reduce_scan_shmem( - m_functor_reducer.get_functor(), block.y); - - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute - - if (!m_result_ptr_device_accessible && m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_policy.space(), m_result_ptr, - m_scratch_space, size); - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - const Policy& arg_policy, const ViewType& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible) {} -}; - template class ParallelScanHIPBase { public: @@ -763,5 +391,3 @@ class ParallelScanWithTotal, } // namespace Kokkos #endif - -#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp deleted file mode 100644 index 3fe568ac361f..000000000000 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ /dev/null @@ -1,936 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKO_HIP_PARALLEL_TEAM_HPP -#define KOKKO_HIP_PARALLEL_TEAM_HPP - -#include - -#if defined(__HIPCC__) - -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class TeamPolicyInternal - : public PolicyTraits { - public: - using execution_policy = TeamPolicyInternal; - - using traits = PolicyTraits; - - template - friend class TeamPolicyInternal; - - private: - typename traits::execution_space m_space; - int m_league_size; - int m_team_size; - int m_vector_length; - size_t m_team_scratch_size[2]; - size_t m_thread_scratch_size[2]; - int m_chunk_size; - bool m_tune_team_size; - bool m_tune_vector_length; - - public: - using execution_space = HIP; - - template - TeamPolicyInternal(TeamPolicyInternal const& p) { - m_league_size = p.m_league_size; - m_team_size = p.m_team_size; - m_vector_length = p.m_vector_length; - m_team_scratch_size[0] = p.m_team_scratch_size[0]; - m_team_scratch_size[1] = p.m_team_scratch_size[1]; - m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; - m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; - m_chunk_size = p.m_chunk_size; - m_space = p.m_space; - m_tune_team_size = p.m_tune_team_size; - m_tune_vector_length = p.m_tune_vector_length; - } - - template - int team_size_max(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common(f); - } - - template - inline int team_size_max(const FunctorType& f, - const ParallelReduceTag&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Max, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - inline int team_size_max(const FunctorType& f, const ReducerType&, - const ParallelReduceTag&) const { - using 
closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - template - int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common( - f); - } - - template - inline int team_size_recommended(FunctorType const& f, - ParallelReduceTag const&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Preferred, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - int team_size_recommended(FunctorType const& f, ReducerType const&, - ParallelReduceTag const&) const { - using closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - inline bool impl_auto_vector_length() const { return m_tune_vector_length; } - inline bool impl_auto_team_size() const { return m_tune_team_size; } - static int vector_length_max() { return HIPTraits::WarpSize; } - - static int verify_requested_vector_length(int requested_vector_length) { - int test_vector_length = - std::min(requested_vector_length, vector_length_max()); - - // Allow only power-of-two vector_length - if (!(is_integral_power_of_two(test_vector_length))) { - int test_pow2 = 1; - constexpr int warp_size = HIPTraits::WarpSize; - while (test_pow2 < warp_size) { - test_pow2 <<= 1; - if (test_pow2 > test_vector_length) { - break; - } - } - test_vector_length = test_pow2 >> 1; - } - - return test_vector_length; - } - - inline static int scratch_size_max(int level) { - // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team - // reductions. They also use one int64_t in static shared memory for a - // shared ID. 
Furthermore, they use additional scratch memory in some - // reduction scenarios, which depend on the size of the value_type and is - // NOT captured here - constexpr size_t max_possible_team_size = 1024; - constexpr size_t max_reserved_shared_mem_per_team = - (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); - // arbitrarily setting level 1 scratch limit to 20MB, for a - // MI250 that would give us about 4.4GB for 2 teams per CU - constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; - - size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; - return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team - : max_l1_scratch_size); - } - - inline void impl_set_vector_length(size_t size) { m_vector_length = size; } - inline void impl_set_team_size(size_t size) { m_team_size = size; } - int impl_vector_length() const { return m_vector_length; } - - int team_size() const { return m_team_size; } - - int league_size() const { return m_league_size; } - - size_t scratch_size(int level, int team_size_ = -1) const { - if (team_size_ < 0) team_size_ = m_team_size; - return m_team_scratch_size[level] + - team_size_ * m_thread_scratch_size[level]; - } - - size_t team_scratch_size(int level) const { - return m_team_scratch_size[level]; - } - - size_t thread_scratch_size(int level) const { - return m_thread_scratch_size[level]; - } - - typename traits::execution_space space() const { return m_space; } - - TeamPolicyInternal() - : m_space(typename traits::execution_space()), - m_league_size(0), - m_team_size(-1), - m_vector_length(0), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(false), - m_tune_vector_length(false) {} - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, int vector_length_request = 1) - : m_space(space_), - m_league_size(league_size_), - m_team_size(team_size_request), 
- m_vector_length( - (vector_length_request > 0) - ? verify_requested_vector_length(vector_length_request) - : (verify_requested_vector_length(1))), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(bool(team_size_request <= 0)), - m_tune_vector_length(bool(vector_length_request <= 0)) { - // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - - // Make sure total block size is permissible - if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " - "Team size x vector length must be smaller than 1024.")); - } - } - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} - // FLAG - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - ) - : TeamPolicyInternal(space_, league_size_, team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(space_, league_size_, -1, -1) - - {} - - TeamPolicyInternal(int league_size_, int team_size_request, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, vector_length_request) {} - - 
TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - vector_length_request) {} - - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(int league_size_, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - -1) {} - - int chunk_size() const { return m_chunk_size; } - - TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { - m_chunk_size = chunk_size_; - return *this; - } - - /** \brief set per team scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerTeamValue const& per_team) { - m_team_scratch_size[level] = per_team.value; - return *this; - } - - /** \brief set per thread scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerThreadValue const& per_thread) { - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - /** \brief set per thread and per team scratch size for a specific level of - * the scratch hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, - PerThreadValue const& per_thread) { - m_team_scratch_size[level] = per_team.value; - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - using member_type = Kokkos::Impl::HIPTeamMember; - - protected: - template - int internal_team_size_common(FunctorType const& f) const { 
- const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); - unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); - using Tag = typename PatternTagFromImplSpecialization::type; - if constexpr (std::is_same_v) { - using Interface = - typename Impl::DeduceFunctorPatternInterface::type; - using Analysis = - Impl::FunctorAnalysis; - shmem_thread += - ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); - } - const int vector_length = impl_vector_length(); - - const auto functor = [&f, shmem_block, shmem_thread, vector_length]( - const hipFuncAttributes& attr, int block_size) { - int functor_shmem = - ::Kokkos::Impl::FunctorTeamShmemSize::value( - f, block_size / vector_length); - return shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - }; - int block_size; - if constexpr (BlockSize == BlockType::Max) { - block_size = hip_get_max_team_blocksize( - space().impl_internal_space_instance(), functor); - } else { - block_size = - hip_get_preferred_team_blocksize( - space().impl_internal_space_instance(), functor); - } - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " - "team size.")); - } - if constexpr (std::is_same_v) { - return block_size / impl_vector_length(); - } else { - // Currently we require Power-of-2 team size for reductions. 
- int p2 = 1; - while (p2 <= block_size) p2 *= 2; - p2 /= 2; - return p2 / impl_vector_length(); - } - } -}; - -__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, - int32_t* scratch_locks, - size_t num_scratch_locks) { - int64_t threadid = 0; - __shared__ int64_t base_thread_id; - if (threadIdx.x == 0 && threadIdx.y == 0) { - int64_t const wraparound_len = - Kokkos::min(int64_t(league_size), - int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); - threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; - threadid *= blockDim.x * blockDim.y; - int done = 0; - while (!done) { - done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); - if (!done) { - threadid += blockDim.x * blockDim.y; - if (int64_t(threadid + blockDim.x * blockDim.y) >= - wraparound_len * blockDim.x * blockDim.y) - threadid = 0; - } - } - base_thread_id = threadid; - } - __syncthreads(); - threadid = base_thread_id; - return threadid; -} - -__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, - int64_t threadid) { - __syncthreads(); - if (threadIdx.x == 0 && threadIdx.y == 0) { - scratch_locks[threadid] = 0; - } -} - -template -class ParallelFor, HIP> { - public: - using Policy = TeamPolicy; - using functor_type = FunctorType; - using size_type = HIP::size_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ team reduce space ] - // [ team shared space ] - - FunctorType const m_functor; - Policy const m_policy; - size_type const m_league_size; - int m_team_size; - size_type const m_vector_size; - int m_shmem_begin; - int m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - - 
template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(member); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(TagType(), member); - } - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - __device__ inline void operator()() const { - // Iterate this block through the league - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team(typename Policy::member_type( - kokkos_impl_hip_shared_memory(), m_shmem_begin, m_shmem_size, - static_cast(static_cast(m_scratch_ptr[1]) + - ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size)); - } - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - inline void execute() const { - int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; - dim3 const grid(static_cast(m_league_size), 1, 1); - dim3 const block(static_cast(m_vector_size), - static_cast(m_team_size), 1); - - using closure_type = - ParallelFor, HIP>; - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - } - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), - m_policy(arg_policy), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = 
m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); - - m_shmem_begin = (sizeof(double) * (m_team_size + 2)); - m_shmem_size = - (m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(m_functor, m_team_size)); - m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - m_scratch_ptr[0] = nullptr; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); - } - - size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); - } - } - - ~ParallelFor() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, HIP> { - public: - using Policy = 
TeamPolicyInternal; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - using value_type = typename ReducerType::value_type; - - public: - using functor_type = FunctorType; - using size_type = HIP::size_type; - - // static int constexpr UseShflReduction = false; - // FIXME_HIP This should be disabled unconditionally for best performance, but - // it currently causes tests to fail. - static constexpr int UseShflReduction = - (ReducerType::static_value_size() != 0); - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ global reduce space ] - // [ team reduce space ] - // [ team shared space ] - // - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - size_type* m_scratch_space; - size_type* m_scratch_flags; - size_type m_team_begin; - size_type m_shmem_begin; - size_type m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - const size_type m_league_size; - int m_team_size; - const size_type m_vector_size; - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& member, reference_type update) const { - m_functor_reducer.get_functor()(member, update); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& 
member, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), member, update); - } - - __device__ inline void iterate_through_league(int const threadid, - reference_type value) const { - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team( - member_type( - kokkos_impl_hip_shared_memory() + m_team_begin, - m_shmem_begin, m_shmem_size, - reinterpret_cast( - reinterpret_cast(m_scratch_ptr[1]) + - static_cast(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size), - value); - } - } - - int compute_block_count() const { - constexpr auto light_weight = - Kokkos::Experimental::WorkItemProperty::HintLightWeight; - constexpr typename Policy::work_item_property property; - // Numbers were tuned on MI210 using dot product and yAx benchmarks - constexpr int block_max = - (property & light_weight) == light_weight ? 
2097152 : 65536; - constexpr int preferred_block_min = 1024; - int block_count = m_league_size; - if (block_count < preferred_block_min) { - // keep blocks as is, already low parallelism - } else if (block_count >= block_max) { - block_count = block_max; - - } else { - int nwork = m_league_size * m_team_size; - int items_per_thread = - (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); - if (items_per_thread < 4) { - int ratio = std::min( - (block_count + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - block_count /= ratio; - } - } - - return block_count; - } - - public: - __device__ inline void operator()() const { - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - using ReductionTag = std::conditional_t; - run(ReductionTag{}, threadid); - - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - __device__ inline void run(SHMEMReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - integral_nonzero_constant const - word_count(reducer.value_size() / sizeof(size_type)); - - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); - // Iterate this block through the league - iterate_through_league(threadid, value); - - // Reduce with final value at blockDim.y - 1 location. 
- bool do_final_reduce = (m_league_size == 0); - if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); - if (do_final_reduce) { - // This is the final block with the final result at the final threads' - // location - - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - - // Iterate this block through the league - iterate_through_league(threadid, value); - - pointer_type const result = - m_result_ptr_device_accessible - ? 
m_result_ptr - : reinterpret_cast(m_scratch_space); - - value_type init; - reducer.init(&init); - if (m_league_size == 0) { - reducer.final(&value); - *result = value; - } else if (Impl::hip_inter_block_shuffle_reduction( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, blockDim.y)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - *result = value; - } - } - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const bool is_empty_range = m_league_size == 0 || m_team_size == 0; - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - Policy::is_graph_kernel::value || - !std::is_same::value; - if (!is_empty_range || need_device_set) { - int const block_count = compute_block_count(); - - m_scratch_space = hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * block_count); - m_scratch_flags = - hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - - dim3 block(m_vector_size, m_team_size, 1); - dim3 grid(block_count, 1, 1); - if (is_empty_range) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - m_policy.space().impl_internal_space_instance()->fence(); - - if (m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_result_ptr, m_scratch_space, size); - } - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - 
m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_team_begin(0), - m_shmem_begin(0), - m_shmem_size(0), - m_scratch_ptr{nullptr, nullptr}, - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - - m_team_begin = - UseShflReduction - ? 0 - : hip_single_inter_block_reduce_scan_shmem( - arg_functor_reducer.get_functor(), m_team_size); - m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value( - arg_functor_reducer.get_functor(), m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - // The global parallel_reduce does not support vector_length other than 1 at - // the moment - if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " - "greater than 1 is not currently 
supported for HIP for dynamic " - "sized reduction types."); - - if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " - "than 64 is not currently supported with HIP for dynamic sized " - "reduction types."); - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && - !UseShflReduction) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); - } - - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " - "L0 scratch memory")); - } - - size_t max_size = arg_policy.team_size_max( - arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " - "large team size.")); - } - } - - ~ParallelReduce() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; -} // namespace Impl -} // namespace Kokkos - -#endif - -#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index ea599989e7ad..ab24004f5fcb 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -18,138 +18,14 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif -#include -#include #include - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG 
-SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - HIP exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIP& arg_exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - 
// Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via host pinned memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - 
m_space(arg_space) { - // Fill in the Header information, directly accessible via managed memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos +#include +#include +#include + +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPManagedSpace); diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index e68bad972307..fbae51883448 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -18,120 +18,11 @@ #define KOKKOS_HIP_SHARED_ALLOCATION_RECORD_HPP #include +#include -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - friend class HostInaccessibleSharedAllocationRecordCommon; - using base_t = HostInaccessibleSharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec*/, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIP& exec_space, 
const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const HIPHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPManagedSpace m_space; - - protected: - ~SharedAllocationRecord(); - 
SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const HIPManagedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::HIPSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPManagedSpace); #endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 7f6aa0d8e82d..e8bdfca66fe2 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -24,10 +24,8 @@ #include #include -#include #include -#include #include #include @@ -287,22 +285,3 @@ void HIPManagedSpace::impl_deallocate( } } // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -#include - -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
-template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp index f3e5adf87e5c..7f2004e5cbc6 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp @@ -65,6 +65,15 @@ class HIPSpace { ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -76,8 +85,6 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -94,8 +101,6 @@ class HIPSpace { private: int m_device; ///< Which HIP device - - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> @@ -129,6 +134,16 @@ class HIPHostPinnedSpace { ~HIPHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t 
arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -140,8 +155,6 @@ class HIPHostPinnedSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -194,6 +207,16 @@ class HIPManagedSpace { ~HIPManagedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -209,8 +232,6 @@ class HIPManagedSpace { private: int m_device; ///< Which HIP device - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -239,8 +260,7 @@ struct Impl::is_hip_type_space : public std::true_type {}; namespace Kokkos { namespace Impl { -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); +static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); //---------------------------------------- diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp new file mode 100644 index 000000000000..67e1181125c2 --- /dev/null +++ 
b/packages/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -0,0 +1,421 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP +#define KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP + +#include + +namespace Kokkos { +namespace Impl { + +template +class TeamPolicyInternal + : public PolicyTraits { + public: + using execution_policy = TeamPolicyInternal; + + using traits = PolicyTraits; + + template + friend class TeamPolicyInternal; + + private: + typename traits::execution_space m_space; + int m_league_size; + int m_team_size; + int m_vector_length; + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; + int m_chunk_size; + bool m_tune_team_size; + bool m_tune_vector_length; + + public: + using execution_space = HIP; + + template + TeamPolicyInternal(TeamPolicyInternal const& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_vector_length = p.m_vector_length; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + m_space = p.m_space; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; + } + + template + int team_size_max(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common(f); + } + + 
template + inline int team_size_max(const FunctorType& f, + const ParallelReduceTag&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Max, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + inline int team_size_max(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + template + int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common( + f); + } + + template + inline int team_size_recommended(FunctorType const& f, + ParallelReduceTag const&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Preferred, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + int team_size_recommended(FunctorType const& f, ReducerType const&, + ParallelReduceTag const&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline bool impl_auto_team_size() const { return m_tune_team_size; } + static int vector_length_max() { return HIPTraits::WarpSize; } + + static int verify_requested_vector_length(int requested_vector_length) { + int test_vector_length = + std::min(requested_vector_length, vector_length_max()); + + // Allow only power-of-two vector_length + if (!(is_integral_power_of_two(test_vector_length))) { + int test_pow2 = 1; + constexpr int warp_size = HIPTraits::WarpSize; + while (test_pow2 < 
warp_size) { + test_pow2 <<= 1; + if (test_pow2 > test_vector_length) { + break; + } + } + test_vector_length = test_pow2 >> 1; + } + + return test_vector_length; + } + + inline static int scratch_size_max(int level) { + // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team + // reductions. They also use one int64_t in static shared memory for a + // shared ID. Furthermore, they use additional scratch memory in some + // reduction scenarios, which depend on the size of the value_type and is + // NOT captured here + constexpr size_t max_possible_team_size = 1024; + constexpr size_t max_reserved_shared_mem_per_team = + (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); + // arbitrarily setting level 1 scratch limit to 20MB, for a + // MI250 that would give us about 4.4GB for 2 teams per CU + constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; + + size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; + return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team + : max_l1_scratch_size); + } + + inline void impl_set_vector_length(size_t size) { m_vector_length = size; } + inline void impl_set_team_size(size_t size) { m_team_size = size; } + int impl_vector_length() const { return m_vector_length; } + + int team_size() const { return m_team_size; } + + int league_size() const { return m_league_size; } + + size_t scratch_size(int level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + size_t team_scratch_size(int level) const { + return m_team_scratch_size[level]; + } + + size_t thread_scratch_size(int level) const { + return m_thread_scratch_size[level]; + } + + typename traits::execution_space space() const { return m_space; } + + TeamPolicyInternal() + : m_space(typename traits::execution_space()), + m_league_size(0), + m_team_size(-1), + m_vector_length(0), + m_team_scratch_size{0, 0}, + 
m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(false), + m_tune_vector_length(false) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, int vector_length_request = 1) + : m_space(space_), + m_league_size(league_size_), + m_team_size(team_size_request), + m_vector_length( + (vector_length_request > 0) + ? verify_requested_vector_length(vector_length_request) + : (verify_requested_vector_length(1))), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(bool(team_size_request <= 0)), + m_tune_vector_length(bool(vector_length_request <= 0)) { + // Make sure league size is permissible + if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + Impl::throw_runtime_exception( + "Requested too large league_size for TeamPolicy on HIP execution " + "space."); + + // Make sure total block size is permissible + if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { + Impl::throw_runtime_exception( + std::string("Kokkos::TeamPolicy< HIP > the team size is too large. 
" + "Team size x vector length must be smaller than 1024.")); + } + } + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + // FLAG + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : TeamPolicyInternal(space_, league_size_, team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) + + {} + + TeamPolicyInternal(int league_size_, int team_size_request, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} + + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + vector_length_request) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), 
league_size_, -1, + -1) {} + + int chunk_size() const { return m_chunk_size; } + + TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerTeamValue const& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerThreadValue const& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + /** \brief set per thread and per team scratch size for a specific level of + * the scratch hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, + PerThreadValue const& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + using member_type = Kokkos::Impl::HIPTeamMember; + + protected: + template + int internal_team_size_common(FunctorType const& f) const { + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); + using Tag = typename PatternTagFromImplSpecialization::type; + if constexpr (std::is_same_v) { + using Interface = + typename Impl::DeduceFunctorPatternInterface::type; + using Analysis = + Impl::FunctorAnalysis; + shmem_thread += + ((Analysis::StaticValueSize != 0) ? 
0 : Analysis::value_size(f)); + } + const int vector_length = impl_vector_length(); + + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + if constexpr (BlockSize == BlockType::Max) { + block_size = hip_get_max_team_blocksize( + space().impl_internal_space_instance(), functor); + } else { + block_size = + hip_get_preferred_team_blocksize( + space().impl_internal_space_instance(), functor); + } + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " + "team size.")); + } + if constexpr (std::is_same_v) { + return block_size / impl_vector_length(); + } else { + // Currently we require Power-of-2 team size for reductions. 
+ int p2 = 1; + while (p2 <= block_size) p2 *= 2; + p2 /= 2; + return p2 / impl_vector_length(); + } + } +}; + +__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, + int32_t* scratch_locks, + size_t num_scratch_locks) { + int64_t threadid = 0; + __shared__ int64_t base_thread_id; + if (threadIdx.x == 0 && threadIdx.y == 0) { + int64_t const wraparound_len = + Kokkos::min(int64_t(league_size), + int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); + threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; + threadid *= blockDim.x * blockDim.y; + int done = 0; + while (!done) { + done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); + if (!done) { + threadid += blockDim.x * blockDim.y; + if (int64_t(threadid + blockDim.x * blockDim.y) >= + wraparound_len * blockDim.x * blockDim.y) + threadid = 0; + } + } + base_thread_id = threadid; + } + __syncthreads(); + threadid = base_thread_id; + return threadid; +} + +__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, + int64_t threadid) { + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + scratch_locks[threadid] = 0; + } +} + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp index 313e5f521729..3d70b5964635 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp @@ -19,7 +19,6 @@ #include #include -#include namespace Kokkos { diff --git a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 5c40d0fbc8d0..4bca29868f78 100644 --- a/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/packages/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -25,19 +25,11 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst, - typename 
View::const_value_type&) { + ZeroMemset(const HIP& exec_space, const View& dst) { KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( dst.data(), 0, dst.size() * sizeof(typename View::value_type), exec_space.hip_stream())); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - KOKKOS_IMPL_HIP_SAFE_CALL( - hipMemset(dst.data(), 0, - dst.size() * sizeof(typename View::value_type))); - } }; } // namespace Impl diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp index 4a40ffcaa4f6..6d541a64148a 100644 --- a/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.cpp @@ -103,6 +103,7 @@ void HPX::print_configuration(std::ostream &os, const bool) const { os << hpx::configuration_string() << '\n'; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 bool &HPX::impl_get_in_parallel() noexcept { static thread_local bool in_parallel = false; return in_parallel; @@ -127,6 +128,7 @@ HPX::impl_not_in_parallel_scope::~impl_not_in_parallel_scope() noexcept { KOKKOS_EXPECTS(!impl_get_in_parallel()); impl_get_in_parallel() = true; } +#endif void HPX::impl_decrement_active_parallel_region_count() { std::unique_lock l(m_active_parallel_region_count_mutex); diff --git a/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp index 1dfc5b406464..26181a7c05d3 100644 --- a/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/packages/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -27,14 +27,6 @@ static_assert(false, #include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -59,6 +51,7 @@ static_assert(false, #include +#include #include #include #include @@ -201,6 +194,7 @@ class HPX { return impl_get_instance_data().m_instance_id; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static bool &impl_get_in_parallel() noexcept; struct impl_in_parallel_scope { @@ -223,9 +217,10 @@ class HPX { delete; }; - static bool in_parallel(HPX 
const & = HPX()) noexcept { + KOKKOS_DEPRECATED static bool in_parallel(HPX const & = HPX()) noexcept { return impl_get_in_parallel(); } +#endif static void impl_decrement_active_parallel_region_count(); static void impl_increment_active_parallel_region_count(); @@ -248,18 +243,6 @@ class HPX { #endif } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - template - KOKKOS_DEPRECATED static void partition_master( - F const &, int requested_num_partitions = 0, int = 0) { - if (requested_num_partitions > 1) { - Kokkos::abort( - "Kokkos::Experimental::HPX::partition_master: can't partition an " - "HPX instance\n"); - } - } -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); #else @@ -355,7 +338,9 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_plain_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, n, stacksize); @@ -417,15 +402,21 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_setup_finalize_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.setup(); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.finalize(); }}, n, stacksize); @@ -1292,6 +1283,7 @@ class ParallelScan, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1299,6 +1291,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + 
barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1320,6 +1315,7 @@ class ParallelScan, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1327,6 +1323,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( @@ -1407,6 +1406,7 @@ class ParallelScanWithTotal, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1414,6 +1414,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1435,6 +1438,7 @@ class ParallelScanWithTotal, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1442,6 +1446,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( diff --git a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index c9080db01caf..297b1fadee94 100644 --- a/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/packages/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -73,7 +73,7 @@ 
is_less_than_value_initialized_variable(T arg) { // Checked narrowing conversion that calls abort if the cast changes the value template -constexpr To checked_narrow_cast(From arg) { +constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = (std::is_signed::value != std::is_signed::value); auto const ret = static_cast(arg); @@ -81,7 +81,12 @@ constexpr To checked_narrow_cast(From arg) { (is_different_signedness && is_less_than_value_initialized_variable(arg) != is_less_than_value_initialized_variable(ret))) { - Kokkos::abort("unsafe narrowing conversion"); + auto msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(arg) + ") in dimension (" + std::to_string(idx) + + "), which may not preserve its original value.\n"; + Kokkos::abort(msg.c_str()); } return ret; } @@ -96,15 +101,15 @@ constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { using T = typename Array::value_type; Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); auto* ptr = a.data(); // NOTE equivalent to // std::transform(std::begin(init), std::end(init), a.data(), // [](U x) { return static_cast(x); }); // except that std::transform is not constexpr. 
- for (auto x : init) { - *ptr++ = checked_narrow_cast(x); - (void)checked_narrow_cast(x); // see note above + for (std::size_t i = 0; i < M; ++i) { + *ptr++ = checked_narrow_cast(init[i], i); + (void)checked_narrow_cast(init[i], i); // see note above } return a; } @@ -120,10 +125,10 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; NVCC_WONT_LET_ME_CALL_YOU_Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); for (std::size_t i = 0; i < M; ++i) { - a[i] = checked_narrow_cast(other[i]); - (void)checked_narrow_cast(other[i]); // see note above + a[i] = checked_narrow_cast(other[i], i); + (void)checked_narrow_cast(other[i], i); // see note above } return a; } @@ -150,9 +155,20 @@ TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { // multi-dimensional iteration pattern template -struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { - using traits = Kokkos::Impl::PolicyTraits; - using range_policy = RangePolicy; +struct MDRangePolicy; + +// Note: If MDRangePolicy has a primary template, implicit CTAD (deduction +// guides) are generated -> MDRangePolicy<> by some compilers, which is +// incorrect. By making it a template specialization instead, no implicit CTAD +// is generated. This works because there has to be at least one property +// specified (which is Rank<...>); otherwise, we'd get the static_assert +// "Kokkos::Error: MD iteration pattern not defined". This template +// specialization uses in all places for correctness. 
+template +struct MDRangePolicy + : public Kokkos::Impl::PolicyTraits { + using traits = Kokkos::Impl::PolicyTraits; + using range_policy = RangePolicy; typename traits::execution_space m_space; @@ -161,8 +177,8 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { typename traits::schedule_type, typename traits::index_type>; using execution_policy = - MDRangePolicy; // needed for is_execution_space - // interrogation + MDRangePolicy; // needed for is_execution_policy + // interrogation template friend struct MDRangePolicy; @@ -327,6 +343,20 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; + + if (m_upper[i] < m_lower[i]) { + std::string msg = + "Kokkos::MDRangePolicy bounds error: The lower bound (" + + std::to_string(m_lower[i]) + ") is greater than its upper bound (" + + std::to_string(m_upper[i]) + ") in dimension " + std::to_string(i) + + ".\n"; +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + Kokkos::abort(msg.c_str()); +#elif defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + Kokkos::Impl::log_warning(msg); +#endif + } + if (m_tile[i] <= 0) { m_tune_tile_size = true; if ((inner_direction == Iterate::Right && (i < rank - 1)) || @@ -358,6 +388,60 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } }; +template +MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; + +template +MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], + const TT (&)[TN]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + 
+template +MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; + +template +MDRangePolicy(Array const&, Array const&, Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&, + Array const&) + ->MDRangePolicy>; + } // namespace Kokkos #endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/packages/kokkos/core/src/Kokkos_Array.hpp b/packages/kokkos/core/src/Kokkos_Array.hpp index 82ceaaec2183..ba1626bb72e6 100644 --- a/packages/kokkos/core/src/Kokkos_Array.hpp +++ b/packages/kokkos/core/src/Kokkos_Array.hpp @@ -22,6 +22,7 @@ #endif #include +#include #include #include @@ -320,6 +321,9 @@ struct Array::strided> { : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +template +Array(T, Us...)->Array; + } // namespace Kokkos // diff --git a/packages/kokkos/core/src/Kokkos_Assert.hpp b/packages/kokkos/core/src/Kokkos_Assert.hpp index c3b9004734a1..6fea286005e0 100644 --- a/packages/kokkos/core/src/Kokkos_Assert.hpp +++ b/packages/kokkos/core/src/Kokkos_Assert.hpp @@ -44,9 +44,6 @@ __LINE__) " \n"); \ } \ } -// some projects already define this for themselves, so don't mess -// them up -#ifndef KOKKOS_ASSERT #define KOKKOS_ASSERT(...) \ { \ if (!bool(__VA_ARGS__)) { \ @@ -58,8 +55,7 @@ __LINE__) " \n"); \ } \ } -#endif // ifndef KOKKOS_ASSERT -#else // not debug mode +#else // not debug mode #define KOKKOS_EXPECTS(...) #define KOKKOS_ENSURES(...) 
#ifndef KOKKOS_ASSERT diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp index 1c4347463219..9acacef901a7 100644 --- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp @@ -25,7 +25,7 @@ static_assert(false, #include #include -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index bda37839805c..eebdd20f15d4 100644 --- a/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/packages/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -49,7 +49,7 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #endif // ============================================================ -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/packages/kokkos/core/src/Kokkos_Clamp.hpp b/packages/kokkos/core/src/Kokkos_Clamp.hpp new file mode 100644 index 000000000000..033cde9ab848 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Clamp.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_CLAMP_HPP +#define KOKKOS_CLAMP_HPP + +#include + +namespace Kokkos { + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi) { + KOKKOS_EXPECTS(!(hi < lo)); + return (value < lo) ? lo : (hi < value) ? hi : value; +} + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi, + ComparatorType comp) { + KOKKOS_EXPECTS(!comp(hi, lo)); + return comp(value, lo) ? lo : comp(hi, value) ? hi : value; +} + +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/Kokkos_CopyViews.hpp b/packages/kokkos/core/src/Kokkos_CopyViews.hpp index a0ca55be7043..08f6ba8d696a 100644 --- a/packages/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/packages/kokkos/core/src/Kokkos_CopyViews.hpp @@ -22,6 +22,7 @@ static_assert(false, #ifndef KOKKOS_COPYVIEWS_HPP_ #define KOKKOS_COPYVIEWS_HPP_ #include +#include #include #include #include @@ -612,12 +613,17 @@ void view_copy(const DstType& dst, const SrcType& src) { }; if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) { - std::string message( - "Error: Kokkos::deep_copy with no available copy mechanism: "); - message += src.label(); - message += " to "; - message += dst.label(); - Kokkos::Impl::throw_runtime_exception(message); + std::ostringstream ss; + ss << "Error: Kokkos::deep_copy with no available copy mechanism: " + << "from source view (\"" << src.label() << "\") to destination view (\"" + << dst.label() << "\").\n" + << "There is no common execution space that can access both source's " + "space\n" + << "(" << src_memory_space().name() << ") and destination's space (" + << dst_memory_space().name() << "), " + << "so source and destination\n" + << "must be contiguous and have the same layout.\n"; + Kokkos::Impl::throw_runtime_exception(ss.str()); } // Figure out iteration order in case we need it @@ -1330,13 +1336,12 @@ inline void 
contiguous_fill( // Default implementation for execution spaces that don't provide a definition template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst, - typename ViewType::const_value_type& value) { - contiguous_fill(exec_space, dst, value); - } - - ZeroMemset(const ViewType& dst, typename ViewType::const_value_type& value) { - contiguous_fill(ExecutionSpace(), dst, value); + ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { + using ValueType = typename ViewType::value_type; + alignas(alignof(ValueType)) unsigned char + zero_initialized_storage[sizeof(ValueType)] = {}; + contiguous_fill(exec_space, dst, + *reinterpret_cast(zero_initialized_storage)); } }; @@ -1348,13 +1353,18 @@ inline std::enable_if_t< contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) - ZeroMemset>(exec_space, dst, value); - else + // With OpenMP, using memset has significant performance issues. 
+ if (Impl::is_zero_byte(value) +#ifdef KOKKOS_ENABLE_OPENMP + && !std::is_same_v #endif + ) + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset>(exec_space, dst); + else contiguous_fill(exec_space, dst, value); } @@ -1379,15 +1389,20 @@ contiguous_fill_or_memset( typename ViewTraits::const_value_type& value) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; + exec_space_type exec; // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset>(dst, value); + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset(exec, dst); else #endif - contiguous_fill(exec_space_type(), dst, value); + contiguous_fill(exec, dst, value); } template diff --git a/packages/kokkos/core/src/Kokkos_Core.hpp b/packages/kokkos/core/src/Kokkos_Core.hpp index 805411a699ec..1f146563be20 100644 --- a/packages/kokkos/core/src/Kokkos_Core.hpp +++ b/packages/kokkos/core/src/Kokkos_Core.hpp @@ -46,14 +46,15 @@ #include #include -#include #include -#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -101,6 +102,7 @@ void declare_configuration_metadata(const std::string& category, [[nodiscard]] bool is_finalized() noexcept; [[nodiscard]] int device_id() noexcept; +[[nodiscard]] int num_devices() noexcept; [[nodiscard]] int num_threads() noexcept; bool show_warnings() noexcept; @@ -300,9 +302,6 @@ std::vector partition_space(ExecSpace const& space, // implementation of 
the RAII wrapper is using Kokkos::single. #include -// Specializations required after core definitions -#include - //---------------------------------------------------------------------------- // Redefinition of the macros min and max if we pushed them at entry of // Kokkos_Core.hpp diff --git a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp index 44f1c5b42f4d..7edb35f00eb4 100644 --- a/packages/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/packages/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -30,10 +30,6 @@ #include #include -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -#include -#endif - //---------------------------------------------------------------------------- // Have assumed a 64-bit build (8-byte pointers) throughout the code base. // 32-bit build allowed but unsupported. @@ -75,9 +71,6 @@ template struct Device; // forward declare here so that backend initializer calls can use it. -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments; -#endif class InitializationSettings; } // namespace Kokkos @@ -262,12 +255,6 @@ KOKKOS_FUNCTION void runtime_check_memory_access_violation( } } // namespace Impl - -namespace Experimental { -template -class LogicalMemorySpace; -} - } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp index ae1585a4989f..5f251eeb26ac 100644 --- a/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/packages/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include #include +#include //---------------------------------------------------------------------------- @@ -114,62 +115,67 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask(0) {} /** \brief Total range */ + template && + std::is_convertible_v), + bool> = false> + inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end) + 
: RangePolicy(typename traits::execution_space(), work_begin, work_end) {} + + /** \brief Total range */ + template && + std::is_convertible_v), + bool> = false> inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end) + const IndexType1 work_begin, const IndexType2 work_end) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); set_auto_chunk_size(); } - /** \brief Total range */ - inline RangePolicy(const member_type work_begin, const member_type work_end) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - } - - /** \brief Total range */ - template - inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end, - Args... args) + template && + std::is_convertible_v), + bool> = false> + RangePolicy(const typename traits::execution_space& work_space, + const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { - set_auto_chunk_size(); - set(args...); + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); + set_chunk_size(chunk_size.value); } /** \brief Total range */ - template - inline RangePolicy(const member_type work_begin, const member_type work_end, - Args... 
args) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - set(args...); - } - - private: - inline void set() {} + template && + std::is_convertible_v), + bool> = false> + RangePolicy(const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) + : RangePolicy(typename traits::execution_space(), work_begin, work_end, + chunk_size) {} public: - template - inline void set(Args...) { - static_assert( - 0 == sizeof...(Args), - "Kokkos::RangePolicy: unhandled constructor arguments encountered."); - } - - template - inline void set(const ChunkSize& chunksize, Args... args) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead") + inline void set(ChunkSize chunksize) { m_granularity = chunksize.value; m_granularity_mask = m_granularity - 1; - set(args...); } +#endif public: /** \brief return chunk_size */ @@ -218,6 +224,67 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask = m_granularity - 1; } + void check_bounds_validity() { + if (m_end < m_begin) { + std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" + + std::to_string(m_begin) + + ") is greater than the upper bound (" + + std::to_string(m_end) + ").\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + m_begin = 0; + m_end = 0; +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } + } + + // To be replaced with std::in_range (c++20) + template + static void check_conversion_safety(const IndexType bound) { +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ + defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; + + if constexpr (std::is_signed_v != + std::is_signed_v) { + 
// check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); + + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } + + // check narrowing + warn |= (static_cast(static_cast(bound)) != bound); + + if (warn) { +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } +#else + (void)bound; +#endif + } + public: /** \brief Subrange for a partition's rank and size. * @@ -261,6 +328,21 @@ class RangePolicy : public Impl::PolicyTraits { }; }; +RangePolicy()->RangePolicy<>; + +RangePolicy(int64_t, int64_t)->RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; + +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) + ->RangePolicy<>; + +template >> +RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; + +template >> +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; + } // namespace Kokkos //---------------------------------------------------------------------------- @@ -983,7 +1065,16 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + policy.team.team_reduce( + Kokkos::Sum{val}); } template @@ -997,7 +1088,29 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + if 
constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); } template @@ -1011,7 +1124,31 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); + policy.team.team_reduce( + Kokkos::Sum{val}); } template diff --git a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp b/packages/kokkos/core/src/Kokkos_HBWSpace.hpp deleted file mode 100644 index 369b7bafb7b8..000000000000 --- a/packages/kokkos/core/src/Kokkos_HBWSpace.hpp +++ /dev/null @@ -1,308 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_HBWSPACE_HPP -#define KOKKOS_HBWSPACE_HPP - -#include -#ifdef KOKKOS_ENABLE_HBWSPACE - -#include - -namespace Kokkos { - -namespace Experimental { - -/// \class HBWSpace -/// \brief Memory management for host memory. -/// -/// HBWSpace is a memory space that governs host memory. "Host" -/// memory means the usual CPU-accessible memory. -class HBWSpace { - public: - //! Tag this class as a kokkos memory space - using memory_space = HBWSpace; - using size_type = size_t; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - using execution_space = Kokkos::DefaultHostExecutionSpace; - - //! 
This memory space preferred device_type - using device_type = Kokkos::Device; - - /**\brief Default memory space instance */ - HBWSpace(); - HBWSpace(const HBWSpace& rhs) = default; - HBWSpace& operator=(const HBWSpace&) = default; - ~HBWSpace() = default; - - /**\brief Non-default memory space instance to choose allocation mechansim, - * if available */ - - enum AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; - - explicit HBWSpace(const AllocationMechanism&); - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const; - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - private: - template - friend class LogicalMemorySpace; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - - public: - /**\brief Return Name of the MemorySpace */ - static constexpr const char* name() { return "HBW"; } - - private: - AllocationMechanism m_alloc_mech; - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::HBWSpace, void>; -}; - -} // namespace Experimental - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public 
SharedAllocationRecord { - private: - friend Kokkos::Experimental::HBWSpace; - - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase*); - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HBWSpace instance */ - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::HBWSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - - static void print_records(std::ostream&, - const Kokkos::Experimental::HBWSpace&, - bool detail = false); -}; - -} // namespace Impl - -} // namespace Kokkos - 
-//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -static_assert( - Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = true }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = false }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(copy_space, dst, src, n); - } -}; - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const 
DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); - } -}; - -} // namespace Impl - -} // namespace Kokkos - -#endif -#endif // #define KOKKOS_HBWSPACE_HPP diff --git a/packages/kokkos/core/src/Kokkos_HostSpace.hpp b/packages/kokkos/core/src/Kokkos_HostSpace.hpp index 252aabd949f8..a1fb0f5a677d 100644 --- a/packages/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/packages/kokkos/core/src/Kokkos_HostSpace.hpp @@ -37,7 +37,6 @@ static_assert(false, #include #include "impl/Kokkos_HostSpace_deepcopy.hpp" -#include /*--------------------------------------------------------------------------*/ @@ -94,6 +93,16 @@ class HostSpace { #endif /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -105,9 +114,6 @@ class HostSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -124,7 +130,6 @@ class HostSpace { private: static constexpr const char* m_name = "Host"; 
- friend class Kokkos::Impl::SharedAllocationRecord; }; } // namespace Kokkos @@ -136,8 +141,7 @@ namespace Kokkos { namespace Impl { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); template struct HostMirror { @@ -173,75 +177,7 @@ struct HostMirror { //---------------------------------------------------------------------------- -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend Kokkos::HostSpace; - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HostSpace instance */ - static RecordBase s_root_record; -#endif - - Kokkos::HostSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
- template - SharedAllocationRecord( - const ExecutionSpace& /* exec_space*/, const Kokkos::HostSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } -}; - -} // namespace Impl - -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HostSpace); //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp b/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp deleted file mode 100644 index 1ee1d2c81fe5..000000000000 --- a/packages/kokkos/core/src/Kokkos_LogicalSpaces.hpp +++ /dev/null @@ -1,413 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_LOGICALSPACES_HPP -#define KOKKOS_LOGICALSPACES_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -namespace Kokkos { -namespace Experimental { -struct DefaultMemorySpaceNamer { - static constexpr const char* get_name() { - return "DefaultLogicalMemorySpaceName"; - } -}; - -struct LogicalSpaceSharesAccess { - struct shared_access {}; - struct no_shared_access {}; -}; - -/// \class LogicalMemorySpace -/// \brief -/// -/// LogicalMemorySpace is a space that is identical to another space, -/// but differentiable by name and template argument -template -class LogicalMemorySpace { -#ifdef KOKKOS_ENABLE_OPENMPTARGET - // [DZP] For some reason I don't yet know, using LogicalMemorySpaces - // inside an OpenMPTarget build causes errors in the - // SharedAllocationRecords of other types. This is my way of erroring - // a build if we instantiate a LogicalMemSpace in an OMPTarget build - static_assert(!std::is_same::value, - "Can't use LogicalMemorySpaces in an OpenMPTarget build, we're " - "debugging memory issues"); -#endif - public: - //! Tag this class as a kokkos memory space - using memory_space = LogicalMemorySpace; - using size_type = typename BaseSpace::size_type; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). 
- - using execution_space = - std::conditional_t::value, - typename BaseSpace::execution_space, - DefaultBaseExecutionSpace>; - - using device_type = Kokkos::Device; - - LogicalMemorySpace() = default; - - template - LogicalMemorySpace(Args&&... args) : underlying_space((Args &&) args...) {} - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); - } - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); - } - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); - } - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); - } - - /**\brief Return Name of the MemorySpace */ - constexpr static const char* name() { return Namer::get_name(); } - - private: - BaseSpace underlying_space; - template - friend class LogicalMemorySpace; - friend class Kokkos::Impl::SharedAllocationRecord; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - return underlying_space.impl_allocate(arg_label, arg_alloc_size, - arg_logical_size, arg_handle); - } - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - underlying_space.impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, - arg_logical_size, arg_handle); - 
} -}; -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - OtherSpace> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - OtherSpace, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { -template -class SharedAllocationRecord, - void> : public SharedAllocationRecord { - private: - using SpaceType = - Kokkos::Experimental::LogicalMemorySpace; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase* arg_rec) { - delete static_cast(arg_rec); - } - -#ifdef 
KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this - * LogicalMemorySpace instance */ - static RecordBase s_root_record; -#endif - - const SpaceType m_space; - - protected: - ~SharedAllocationRecord() { - m_space.deallocate(RecordBase::m_alloc_ptr->m_label, - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); - } - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const SpaceType& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast*>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; - } - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - 
KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const SpaceType& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return (void*)nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); - } - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked: fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); - } - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } - } - - static SharedAllocationRecord* get_record(void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = SharedAllocationRecord; - - SharedAllocationHeader const* const head = - alloc_ptr ? Header::get_header(alloc_ptr) - : (SharedAllocationHeader*)nullptr; - RecordHost* const record = - head ? 
static_cast(head->m_record) : (RecordHost*)nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< LogicalMemorySpace<> , " - "void >::get_record ERROR")); - } - - return record; - } -#ifdef KOKKOS_ENABLE_DEBUG - static void print_records(std::ostream& s, const SpaceType&, - bool detail = false) { - SharedAllocationRecord::print_host_accessible_records( - s, "HostSpace", &s_root_record, detail); - } -#else - static void print_records(std::ostream&, const SpaceType&, - bool detail = false) { - (void)detail; - throw_runtime_exception( - "SharedAllocationRecord::print_records only works " - "with KOKKOS_ENABLE_DEBUG enabled"); - } -#endif -}; -#ifdef KOKKOS_ENABLE_DEBUG -/**\brief Root record for tracked allocations from this LogicalSpace - * instance */ -template -SharedAllocationRecord - SharedAllocationRecord, - void>::s_root_record; -#endif - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct DeepCopy, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - DestinationSpace, ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; -} // namespace Impl - -} // namespace 
Kokkos -#endif // KOKKOS_LOGICALSPACES_HPP diff --git a/packages/kokkos/core/src/Kokkos_Macros.hpp b/packages/kokkos/core/src/Kokkos_Macros.hpp index 3cf7ac4fa24b..b255d2a51950 100644 --- a/packages/kokkos/core/src/Kokkos_Macros.hpp +++ b/packages/kokkos/core/src/Kokkos_Macros.hpp @@ -84,11 +84,12 @@ //---------------------------------------------------------------------------- -#if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \ - !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_HPX) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_INTERNAL_NOT_PARALLEL +#if defined(KOKKOS_ENABLE_ATOMICS_BYPASS) && \ + (defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_HPX) || \ + defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENACC)) +#error Atomics may only be disabled if neither a host parallel nor a device backend is enabled #endif #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA @@ -339,12 +340,6 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif -// Temporary solution for SYCL not supporting printf in kernels. -// Might disappear at any point once we have found another solution. -#if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF) -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) ::printf(__VA_ARGS__) -#endif - //---------------------------------------------------------------------------- // Define final version of functions. 
This is so that clang tidy can find these // macros more easily @@ -433,22 +428,6 @@ #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL #endif -//---------------------------------------------------------------------------- -// Determine for what space the code is being compiled: -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) - -#if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA -#elif defined(__SYCL_DEVICE_ONLY__) && defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL -#elif defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && \ - defined(KOKKOS_ENABLE_HIP) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU -#else -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST -#endif - -#endif //---------------------------------------------------------------------------- // Remove surrounding parentheses if present diff --git a/packages/kokkos/core/src/Kokkos_MasterLock.hpp b/packages/kokkos/core/src/Kokkos_MasterLock.hpp deleted file mode 100644 index 1d09617371a6..000000000000 --- a/packages/kokkos/core/src/Kokkos_MasterLock.hpp +++ /dev/null @@ -1,56 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MASTER_LOCK_HPP -#define KOKKOS_MASTER_LOCK_HPP - -#include - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - -namespace Kokkos { -namespace Experimental { - -// my be used to coordinate work between master instances -// SHOULD NOT be used within a parallel algorithm -// -// This lock should be used with with a scoped lock guard -// i.e. std::unique_lock, std::lock_guard -// -// cannot be copied or moved -// has the following functions available -// -// Lock() -// ~Lock() -// -// void lock() -// void unlock() -// bool try_lock() -// -template -class MasterLock; - -} // namespace Experimental -} // namespace Kokkos - -#endif - -#endif // KOKKOS_MASTER_LOCK_HPP diff --git a/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp b/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp index 51a50d347dee..1a77f373fd85 100644 --- a/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp +++ b/packages/kokkos/core/src/Kokkos_MathematicalConstants.hpp @@ -51,24 +51,6 @@ KOKKOS_IMPL_MATH_CONSTANT(phi, 1.618033988749894848204586834365638118L); } // namespace Kokkos::numbers -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Kokkos::Experimental { -using Kokkos::numbers::e_v; -using Kokkos::numbers::egamma_v; -using Kokkos::numbers::inv_pi_v; -using Kokkos::numbers::inv_sqrt3_v; -using Kokkos::numbers::inv_sqrtpi_v; -using Kokkos::numbers::ln10_v; -using Kokkos::numbers::ln2_v; -using Kokkos::numbers::log10e_v; -using Kokkos::numbers::log2e_v; -using Kokkos::numbers::phi_v; -using Kokkos::numbers::pi_v; -using Kokkos::numbers::sqrt2_v; -using Kokkos::numbers::sqrt3_v; -} // namespace Kokkos::Experimental -#endif - #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef 
KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS diff --git a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp index ee64c67b93bd..3fead8dd2936 100644 --- a/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp +++ b/packages/kokkos/core/src/Kokkos_MathematicalFunctions.hpp @@ -92,16 +92,6 @@ using promote_3_t = typename promote_3::type; #endif #endif -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE -#else -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - /* nothing */ -#endif - #define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ @@ -128,13 +118,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } // isinf, isnan, and isinfinite do not work on Windows with CUDA with std:: // getting warnings about calling host function in device function then @@ -151,9 +135,7 @@ using promote_3_t = typename promote_3::type; KOKKOS_INLINE_FUNCTION std::enable_if_t, bool> FUNC( \ T x) { \ return ::FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #else #define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC) \ KOKKOS_INLINE_FUNCTION bool FUNC(float x) { \ @@ -173,9 +155,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return 
FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #endif #define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC) \ @@ -218,16 +198,10 @@ using promote_3_t = typename promote_3::type; long double> \ FUNC(T1 x, T2 y) { \ using Promoted = Kokkos::Impl::promote_2_t; \ - static_assert(std::is_same_v, ""); \ + static_assert(std::is_same_v); \ using std::FUNC; \ return FUNC(static_cast(x), static_cast(y)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } #define KOKKOS_IMPL_MATH_TERNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x, float y, float z) { \ @@ -314,8 +288,6 @@ inline long double abs(long double x) { using std::abs; return abs(x); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { using ::Kokkos::abs; }) KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs) KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod) KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder) @@ -336,12 +308,6 @@ KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); } KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); } #endif inline long double nanl(char const* arg) { return ::nanl(arg); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { - using ::Kokkos::nan; - using ::Kokkos::nanf; - using ::Kokkos::nanl; - }) // Exponential functions KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp) // FIXME_NVHPC nvc++ has issues with exp2 @@ -478,7 +444,6 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit) // islessgreater // isunordered -#undef KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED #undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE diff --git a/packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp 
b/packages/kokkos/core/src/Kokkos_MinMax.hpp similarity index 83% rename from packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp rename to packages/kokkos/core/src/Kokkos_MinMax.hpp index 37a28a80b68e..5c60a88bfb1e 100644 --- a/packages/kokkos/core/src/Kokkos_MinMaxClamp.hpp +++ b/packages/kokkos/core/src/Kokkos_MinMax.hpp @@ -14,13 +14,8 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MIN_MAX_CLAMP_HPP -#define KOKKOS_MIN_MAX_CLAMP_HPP +#ifndef KOKKOS_MIN_MAX_HPP +#define KOKKOS_MIN_MAX_HPP #include #include @@ -29,22 +24,6 @@ static_assert(false, namespace Kokkos { -// clamp -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi) { - KOKKOS_EXPECTS(!(hi < lo)); - return (value < lo) ? lo : (hi < value) ? hi : value; -} - -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi, - ComparatorType comp) { - KOKKOS_EXPECTS(!comp(hi, lo)); - return comp(value, lo) ? lo : comp(hi, value) ? 
hi : value; -} - // max template constexpr KOKKOS_INLINE_FUNCTION const T& max(const T& a, const T& b) { @@ -199,15 +178,6 @@ KOKKOS_INLINE_FUNCTION constexpr Kokkos::pair minmax( return result; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Experimental { -using ::Kokkos::clamp; -using ::Kokkos::max; -using ::Kokkos::min; -using ::Kokkos::minmax; -} // namespace Experimental -#endif - } // namespace Kokkos #endif diff --git a/packages/kokkos/core/src/Kokkos_Pair.hpp b/packages/kokkos/core/src/Kokkos_Pair.hpp index 7127c78280e2..9be8d8d7aa19 100644 --- a/packages/kokkos/core/src/Kokkos_Pair.hpp +++ b/packages/kokkos/core/src/Kokkos_Pair.hpp @@ -28,6 +28,7 @@ #endif #include +#include #include namespace Kokkos { @@ -484,7 +485,6 @@ KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( } namespace Impl { - template struct is_pair_like : std::false_type {}; template diff --git a/packages/kokkos/core/src/Kokkos_Printf.hpp b/packages/kokkos/core/src/Kokkos_Printf.hpp index 39f95825c382..63a4cce2aeb6 100644 --- a/packages/kokkos/core/src/Kokkos_Printf.hpp +++ b/packages/kokkos/core/src/Kokkos_Printf.hpp @@ -30,8 +30,11 @@ namespace Kokkos { // In contrast to std::printf, return void to get a consistent behavior across // backends. The GPU backends always return 1 and NVHPC only compiles if we // don't ask for the return value. +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) +using ::printf; +#else template -KOKKOS_FUNCTION void printf(const char* format, Args... args) { +KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) { #ifdef KOKKOS_ENABLE_SYCL // Some compilers warn if "args" is empty and format is not a string literal if constexpr (sizeof...(Args) == 0) @@ -39,15 +42,13 @@ KOKKOS_FUNCTION void printf(const char* format, Args... 
args) { else sycl::ext::oneapi::experimental::printf(format, args...); #else - if constexpr (sizeof...(Args) == 0) ::printf("%s", format); - // FIXME_OPENMPTARGET non-string-literal argument used in printf is not - // supported for spir64 -#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)) + if constexpr (sizeof...(Args) == 0) + ::printf("%s", format); else ::printf(format, args...); #endif -#endif } +#endif } // namespace Kokkos diff --git a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp index 29a04ac3b07e..e7a9ba0c7ed7 100644 --- a/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/packages/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -22,49 +22,34 @@ #endif #include -#include #include #include -namespace Kokkos { -namespace Profiling { +namespace Kokkos::Profiling { + +class [[nodiscard]] ProfilingSection { + uint32_t sectionID; -class ProfilingSection { public: ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; - ProfilingSection(const std::string& sectionName) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::createProfileSection(sectionName, &secID); - } - } - - void start() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::startSection(secID); - } +#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 + [[nodiscard]] +#endif + explicit ProfilingSection(const std::string& sectionName) { + Kokkos::Profiling::createProfileSection(sectionName, §ionID); } - void stop() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::stopSection(secID); - } - } + void start() { Kokkos::Profiling::startSection(sectionID); } - ~ProfilingSection() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::destroyProfileSection(secID); - } - } + void stop() { 
Kokkos::Profiling::stopSection(sectionID); } - protected: - uint32_t secID; + ~ProfilingSection() { Kokkos::Profiling::destroyProfileSection(sectionID); } }; -} // namespace Profiling -} // namespace Kokkos +} // namespace Kokkos::Profiling #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/packages/kokkos/core/src/Kokkos_Swap.hpp b/packages/kokkos/core/src/Kokkos_Swap.hpp new file mode 100644 index 000000000000..2f849a13ab61 --- /dev/null +++ b/packages/kokkos/core/src/Kokkos_Swap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SWAP_HPP +#define KOKKOS_SWAP_HPP + +#include + +#include +#include +#include + +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr std::enable_if_t && + std::is_move_assignable_v> +kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& + std::is_nothrow_move_assignable_v) { + T t(std::move(a)); + a = std::move(b); + b = std::move(t); +} + +namespace Impl { + +template +struct is_swappable { + template + static decltype(kokkos_swap(std::declval(), std::declval())) + test_swap(int); + struct Nope; + template + static Nope test_swap(long); + static constexpr bool value = + !std::is_same_v(0)), Nope>; +}; + +template +inline constexpr bool is_nothrow_swappable_v = + noexcept(kokkos_swap(std::declval(), std::declval())); + +} // namespace Impl + +template +KOKKOS_FUNCTION constexpr std::enable_if_t::value> +kokkos_swap(T (&a)[N], T (&b)[N]) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } +} + +} // namespace Kokkos + +#endif diff --git a/packages/kokkos/core/src/Kokkos_Tuners.hpp b/packages/kokkos/core/src/Kokkos_Tuners.hpp index 618401654e74..f5ffc66af5b5 100644 --- a/packages/kokkos/core/src/Kokkos_Tuners.hpp +++ b/packages/kokkos/core/src/Kokkos_Tuners.hpp @@ -256,13 +256,14 @@ auto get_point_helper(const PointType& in, const ArrayType& indices, template struct GetPoint; -template -struct GetPoint> { +template +struct GetPoint< + PointType, + std::array> { using index_set_type = - std::array; + std::array; static auto build(const PointType& in, const index_set_type& indices) { - return get_point_helper(in, indices, std::make_index_sequence{}); + return get_point_helper(in, indices, std::make_index_sequence{}); } }; diff --git a/packages/kokkos/core/src/Kokkos_View.hpp b/packages/kokkos/core/src/Kokkos_View.hpp index bcbb28014cd9..484a0e6f62e4 100644 --- 
a/packages/kokkos/core/src/Kokkos_View.hpp +++ b/packages/kokkos/core/src/Kokkos_View.hpp @@ -39,7 +39,7 @@ static_assert(false, #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include #endif -#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -75,25 +75,59 @@ constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); } -KOKKOS_INLINE_FUNCTION -void runtime_check_rank(const size_t rank, const size_t dyn_rank, - const bool is_void_spec, const size_t i0, - const size_t i1, const size_t i2, const size_t i3, - const size_t i4, const size_t i5, const size_t i6, - const size_t i7, const std::string& label) { +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. +template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { (void)(label); if (is_void_spec) { const size_t num_passed_args = count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. 
+ constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } - if (num_passed_args != dyn_rank && num_passed_args != rank) { + if (!n_args_is_dyn_rank && !n_args_is_rank) { KOKKOS_IF_ON_HOST( const std::string message = - "Constructor for Kokkos View '" + label + - "' has mismatched number of arguments. Number of arguments = " + + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. The number " + "of arguments = " + std::to_string(num_passed_args) + - " but dynamic rank = " + std::to_string(dyn_rank) + " \n"; + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; Kokkos::abort(message.c_str());) KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " "mismatched number of arguments.");) @@ -814,15 +848,15 @@ class View : public ViewTraits { template static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) 
{ - static_assert(rank <= sizeof...(Is), ""); - static_assert(sizeof...(Is) <= 8, ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); } template static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(rank == sizeof...(Is), ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); } public: @@ -1402,21 +1436,30 @@ class View : public ViewTraits { "execution space"); } - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - rank, rank_dynamic, - std::is_same::value, i0, i1, i2, i3, - i4, i5, i6, i7, alloc_name); +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); @@ -1445,6 +1488,29 @@ class View : public ViewTraits { typename 
Impl::ViewCtorProp::pointer_type>::value, "Constructing View to wrap user memory must supply matching pointer " "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif } // Simple dimension-only layout diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp index f54c44d66f01..99daf379b6ff 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -58,8 +58,10 @@ void Kokkos::Experimental::OpenACC::impl_initialize( Impl::OpenACCInternal::m_acc_device_num = acc_get_device_num(acc_device_host); } else { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - int const dev_num = get_gpu(settings); + int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; } diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp index b012f6a42a41..5155bee33dc3 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -91,7 +91,11 @@ class OpenACC { #else int concurrency() const { return 256000; } // FIXME_OPENACC #endif - static bool in_parallel() { return acc_on_device(acc_device_not_host); } +#ifdef 
KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool in_parallel() { + return acc_on_device(acc_device_not_host); + } +#endif uint32_t impl_instance_id() const noexcept; Impl::OpenACCInternal* impl_internal_space_instance() const { return m_space_instance.get(); diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp index 141ec77fd1f0..acc0dcd3c6e2 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.cpp @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include @@ -66,6 +66,19 @@ void *Kokkos::Experimental::OpenACCSpace::impl_allocate( ptr = acc_malloc(arg_alloc_size); + if (!ptr) { + size_t alignment = 1; // OpenACC does not handle alignment + using Kokkos::Experimental::RawMemoryAllocationFailure; + auto failure_mode = + arg_alloc_size > 0 + ? RawMemoryAllocationFailure::FailureMode::OutOfMemoryError + : RawMemoryAllocationFailure::FailureMode::InvalidAllocationSize; + auto alloc_mechanism = + RawMemoryAllocationFailure::AllocationMechanism::OpenACCMalloc; + throw RawMemoryAllocationFailure(arg_alloc_size, alignment, failure_mode, + alloc_mechanism); + } + if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp index 4aed7e00f765..ca022192b0bc 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp @@ -34,7 +34,7 @@ struct Kokkos::Impl::DeepCopy 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -52,7 +52,7 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -60,7 +60,7 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } }; @@ -70,7 +70,9 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_to_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, const void* src, size_t n) { @@ -85,7 +87,8 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -93,7 +96,8 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; @@ -104,7 +108,8 
@@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -120,14 +125,17 @@ template struct Kokkos::Impl::DeepCopy< Kokkos::HostSpace, Kokkos::Experimental::OpenACCSpace, ExecutionSpace> { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_from_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { exec.fence( "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index 6645616ba519..c3d723687270 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -35,7 +35,7 @@ class OpenACCInternal { public: static int m_acc_device_num; - int m_async_arg = acc_async_sync; + int m_async_arg = acc_async_noval; OpenACCInternal() = default; @@ -43,7 +43,7 @@ class OpenACCInternal { bool verify_is_initialized(const char* const label) const; - void initialize(int async_arg = acc_async_sync); + void initialize(int async_arg = acc_async_noval); void finalize(); bool is_initialized() const; diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 2c7793dc1165..5afb5e75d392 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ 
b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -31,7 +31,7 @@ template ::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -136,6 +136,7 @@ class Kokkos::Impl::ParallelReduce> struct OpenACCParallelReduceHelper { OpenACCParallelReduceHelper(Functor const&, Reducer const&, Policy const&) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -140,6 +140,7 @@ class Kokkos::Impl::ParallelReduce::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -129,7 +129,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "custom reduction is not implemented"); } @@ -140,7 +140,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "custom reduction is not implemented"); } @@ -394,6 +394,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( vector_length); \ functor(team, val); \ } \ + acc_wait(async_arg); \ aval = val; \ } \ } // namespace Kokkos::Experimental::Impl diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp index 91faa64f7333..76e1514476a0 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp @@ -16,92 +16,11 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE -#include +#include #include -#include 
-#include - -#ifdef KOKKOS_ENABLE_DEBUG -Kokkos::Impl::SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::OpenACCSpace, void>::s_root_record; -#endif - -Kokkos::Impl::SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -Kokkos::Impl::SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenACCSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - Kokkos::Impl::DeepCopy( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -Kokkos::Impl::SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenACC &arg_exec_space, - const Kokkos::Experimental::OpenACCSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, 
arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - Kokkos::Impl::DeepCopy( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -//============================================================================== -// {{{1 +#include #include -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicitly instantiate these CRTP base classes -// here, where we have access to the associated *_timpl.hpp header files. -template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; -template class Kokkos::Impl::SharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - -// end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenACCSpace); diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp index cf83a5b27bcb..cde5ecdcb778 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp @@ -20,55 +20,7 @@ #include #include -#include - -template <> -class Kokkos::Impl::SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - friend class SharedAllocationRecordCommon; - friend Kokkos::Experimental::OpenACCSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenACCSpace>; - using RecordBase = 
SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenACCSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenACCSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenACC& exec_space, - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::OpenACCSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); -}; +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::OpenACCSpace); #endif diff --git a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp index 4ec71f56ef66..20ea392452b7 100644 --- a/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp +++ b/packages/kokkos/core/src/OpenACC/Kokkos_OpenACC_Team.hpp @@ -82,7 +82,7 @@ class OpenACCTeamMember { // FIXME_OPENACC: team_broadcast() is not implemented. 
template KOKKOS_FUNCTION void team_broadcast(ValueType& value, int thread_id) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_broadcast() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -99,7 +99,7 @@ class OpenACCTeamMember { template KOKKOS_FUNCTION ValueType team_reduce(const ValueType& value, const JoinOp& op_in) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_reduce() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -110,7 +110,7 @@ class OpenACCTeamMember { KOKKOS_FUNCTION ArgType team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { static_assert( - !Kokkos::Impl::always_true::value, + Kokkos::Impl::always_false::value, "Kokkos Error: team_scan() is not implemented for the OpenACC backend"); return ArgType(); } diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp index 9a169a435c73..81f2c5c30560 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -81,29 +81,16 @@ bool OpenMP::impl_is_initialized() noexcept { return Impl::OpenMPInternal::singleton().is_initialized(); } -bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return ( - (exec_space.impl_internal_space_instance()->m_level < omp_get_level()) && - (!Impl::t_openmp_instance || - Impl::t_openmp_instance->m_level < omp_get_level())); -#else +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { return exec_space.impl_internal_space_instance()->m_level < omp_get_level(); -#endif } +#endif int OpenMP::impl_thread_pool_size() const noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return OpenMP::in_parallel(*this) - ? 
omp_get_num_threads() - : (Impl::t_openmp_instance - ? Impl::t_openmp_instance->m_pool_size - : impl_internal_space_instance()->m_pool_size); -#else - return OpenMP::in_parallel(*this) + return (impl_internal_space_instance()->get_level() < omp_get_level()) ? omp_get_num_threads() : impl_internal_space_instance()->m_pool_size; -#endif } int OpenMP::impl_max_hardware_threads() noexcept { diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index 594f40d5245a..11292af84ad4 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -27,14 +27,7 @@ static_assert(false, #include -#include -#include #include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -45,6 +38,8 @@ static_assert(false, #include +#include +#include #include /*--------------------------------------------------------------------------*/ @@ -53,11 +48,6 @@ namespace Kokkos { namespace Impl { class OpenMPInternal; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -// FIXME_OPENMP we can remove this after we remove partition_master -inline thread_local OpenMPInternal* t_openmp_instance = nullptr; -#endif } // namespace Impl /// \class OpenMP @@ -67,12 +57,7 @@ class OpenMP { //! Tag this class as a kokkos execution space using execution_space = OpenMP; - using memory_space = -#ifdef KOKKOS_ENABLE_HBWSPACE - Experimental::HBWSpace; -#else - HostSpace; -#endif + using memory_space = HostSpace; //! This execution space preferred device_type using device_type = Kokkos::Device; @@ -87,8 +72,10 @@ class OpenMP { /// \brief Print configuration information to the given output stream. 
void print_configuration(std::ostream& os, bool verbose = false) const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief is the instance running a parallel algorithm - static bool in_parallel(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED static bool in_parallel(OpenMP const& = OpenMP()) noexcept; +#endif /// \brief Wait until all dispatched functors complete on the given instance /// @@ -104,18 +91,6 @@ class OpenMP { /// This always returns false on OpenMP inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - /// \brief Partition the default instance and call 'f' on each new 'master' - /// thread - /// - /// Func is a functor with the following signiture - /// void( int partition_id, int num_partitions ) - template - KOKKOS_DEPRECATED static void partition_master( - F const& f, int requested_num_partitions = 0, - int requested_partition_size = 0); -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); #else @@ -166,14 +141,7 @@ class OpenMP { }; inline int OpenMP::impl_thread_pool_rank() noexcept { - // FIXME_OPENMP Can we remove this when removing partition_master? It's only - // used in one partition_master test -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - KOKKOS_IF_ON_HOST( - (return Impl::t_openmp_instance ? 
0 : omp_get_thread_num();)) -#else KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) -#endif KOKKOS_IF_ON_DEVICE((return -1;)) } diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 12bf3b71f7c1..32172fbc6c73 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -47,61 +47,6 @@ void OpenMPInternal::release_lock() { desul::MemoryScopeDevice()); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void OpenMPInternal::validate_partition_impl(const int nthreads, - int &num_partitions, - int &partition_size) { - if (nthreads == 1) { - num_partitions = 1; - partition_size = 1; - } else if (num_partitions < 1 && partition_size < 1) { - int idle = nthreads; - for (int np = 2; np <= nthreads; ++np) { - for (int ps = 1; ps <= nthreads / np; ++ps) { - if (nthreads - np * ps < idle) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } else if (num_partitions < 1 && partition_size > 0) { - if (partition_size <= nthreads) { - num_partitions = nthreads / partition_size; - } else { - num_partitions = 1; - partition_size = nthreads; - } - } else if (num_partitions > 0 && partition_size < 1) { - if (num_partitions <= nthreads) { - partition_size = nthreads / num_partitions; - } else { - num_partitions = nthreads; - partition_size = 1; - } - } else if (num_partitions * partition_size > nthreads) { - int idle = nthreads; - const int NP = num_partitions; - const int PS = partition_size; - for (int np = NP; np > 0; --np) { - for (int ps = PS; ps > 0; --ps) { - if ((np * ps <= nthreads) && (nthreads - np * ps < idle)) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } -} -#endif - void OpenMPInternal::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * diff --git 
a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 03f5fff395a8..35b9aa93ba7c 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -41,16 +41,6 @@ #include /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { - -inline bool execute_in_serial(OpenMP const& space = OpenMP()) { - return (OpenMP::in_parallel(space) && - !(omp_get_nested() && (omp_get_level() == 1))); -} - -} // namespace Impl -} // namespace Kokkos namespace Kokkos { namespace Impl { @@ -99,11 +89,6 @@ class OpenMPInternal { // Release lock used to protect access to m_pool void release_lock(); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - static void validate_partition_impl(const int nthreads, int& num_partitions, - int& partition_size); -#endif - void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, size_t team_shared_bytes, size_t thread_local_bytes); @@ -115,6 +100,8 @@ class OpenMPInternal { return m_pool[i]; } + int get_level() const { return m_level; } + bool is_initialized() const { return m_initialized; } bool verify_is_initialized(const char* const label) const; @@ -122,32 +109,20 @@ class OpenMPInternal { void print_configuration(std::ostream& s) const; }; -} // namespace Impl - -namespace Experimental { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template <> -class MasterLock { - public: - void lock() { omp_set_lock(&m_lock); } - void unlock() { omp_unset_lock(&m_lock); } - bool try_lock() { return static_cast(omp_test_lock(&m_lock)); } - - KOKKOS_DEPRECATED MasterLock() { omp_init_lock(&m_lock); } - ~MasterLock() { omp_destroy_lock(&m_lock); } - - MasterLock(MasterLock const&) = delete; - MasterLock(MasterLock&&) = delete; - MasterLock& operator=(MasterLock const&) = delete; - MasterLock& operator=(MasterLock&&) = delete; - - private: - omp_lock_t 
m_lock; -}; +inline bool execute_in_serial(OpenMP const& space = OpenMP()) { +// The default value returned by `omp_get_max_active_levels` with gcc version +// lower than 11.1.0 is 2147483647 instead of 1. +#if (!defined(KOKKOS_COMPILER_GNU) || KOKKOS_COMPILER_GNU >= 1110) && \ + _OPENMP >= 201511 + bool is_nested = omp_get_max_active_levels() > 1; +#else + bool is_nested = static_cast(omp_get_nested()); #endif + return (space.impl_internal_space_instance()->get_level() < omp_get_level() && + !(is_nested && (omp_get_level() == 1))); +} -} // namespace Experimental +} // namespace Impl namespace Experimental { namespace Impl { @@ -202,50 +177,6 @@ std::vector partition_space(OpenMP const& main_instance, return Impl::create_OpenMP_instances(main_instance, weights); } } // namespace Experimental - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template -KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, - int partition_size) { -#if _OPENMP >= 201511 - if (omp_get_max_active_levels() > 1) { -#else - if (omp_get_nested()) { -#endif - using Exec = Impl::OpenMPInternal; - - Exec* prev_instance = &Impl::OpenMPInternal::singleton(); - - Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, - partition_size); - - OpenMP::memory_space space; - -#pragma omp parallel num_threads(num_partitions) - { - Exec thread_local_instance(partition_size); - Impl::t_openmp_instance = &thread_local_instance; - - size_t pool_reduce_bytes = 32 * partition_size; - size_t team_reduce_bytes = 32 * partition_size; - size_t team_shared_bytes = 1024 * partition_size; - size_t thread_local_bytes = 1024; - - thread_local_instance.resize_thread_data( - pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, - thread_local_bytes); - - omp_set_num_threads(partition_size); - f(omp_get_thread_num(), omp_get_num_threads()); - Impl::t_openmp_instance = nullptr; - } - } else { - // nested openmp not enabled - f(0, 1); - } -} -#endif - } // namespace Kokkos #endif 
diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp index 96dc664eb79a..823a7e668e57 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp @@ -147,15 +147,7 @@ class ParallelFor, Kokkos::OpenMP> { inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -251,16 +243,9 @@ class ParallelFor, inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy) : m_instance(nullptr), m_iter(arg_policy, arg_functor) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } + template static int max_tile_size_product(const Policy&, const Functor&) { /** @@ -409,15 +394,7 @@ class ParallelFor, m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize::value( arg_functor, arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp index 52cdef18e659..05fd1c9dce3c 100644 --- 
a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp @@ -170,15 +170,7 @@ class ParallelReduce, m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), m_result_ptr(arg_view.data()) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, @@ -319,15 +311,7 @@ class ParallelReduce::accessible, @@ -543,15 +527,7 @@ class ParallelReduce::value( arg_functor_reducer.get_functor(), arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess, inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -292,15 +284,7 @@ class ParallelScanWithTotal, Kokkos::Impl::MemorySpaceAccess::accessible, "Kokkos::OpenMP parallel_scan result must be host-accessible!"); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } //---------------------------------------- diff --git 
a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index adf972dd081f..ea4e7f6baba2 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -65,7 +65,11 @@ class OpenMPTarget { using scratch_memory_space = ScratchMemorySpace; - inline static bool in_parallel() { return omp_in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static bool in_parallel() { + return omp_in_parallel(); + } +#endif static void fence(const std::string& name = "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence"); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index 81fbc56de005..a414b34d7c68 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -37,7 +37,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -110,79 +109,13 @@ void OpenMPTargetSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - 
-SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // TODO DeepCopy - // DeepCopy - Kokkos::Impl::DeepCopy( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
-template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenMPTargetSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index e5b33d0982f8..ed625cfcc82c 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -98,6 +98,16 @@ class OpenMPTargetSpace { ~OpenMPTargetSpace() = default; /**\brief Allocate untracked memory in the space */ + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -121,9 +131,6 @@ class OpenMPTargetSpace { const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = Kokkos::Tools::make_space_handle(name())) const; - - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>; }; } // namespace Experimental } // namespace Kokkos @@ -131,64 +138,8 @@ class OpenMPTargetSpace { 
//---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend Kokkos::Experimental::OpenMPTargetSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenMPTargetSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenMPTargetSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc) { - KOKKOS_IF_ON_HOST( - (return new SharedAllocationRecord(arg_space, arg_label, 
arg_alloc);)) - KOKKOS_IF_ON_DEVICE( - ((void)arg_space; (void)arg_label; (void)arg_alloc; return nullptr;)) - } -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::OpenMPTargetSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index 1902c38409a9..b39f5aca3533 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -75,6 +75,7 @@ int* OpenMPTargetExec::m_lock_array = nullptr; uint64_t OpenMPTargetExec::m_lock_size = 0; uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; +std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; void OpenMPTargetExec::clear_scratch() { Kokkos::Experimental::OpenMPTargetSpace space; @@ -98,6 +99,11 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, int64_t shmem_size_L1, int64_t league_size) { Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif const int64_t shmem_size = shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. const int64_t padding = shmem_size * 10 / 100; // Padding per team. 
diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 9e8844a6f208..3387108da395 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -178,8 +178,10 @@ void OpenMPTarget::impl_static_fence(const std::string& name) { } void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - const int device_num = get_gpu(settings); + const int device_num = get_gpu(settings).value_or(visible_devices[0]); omp_set_default_device(device_num); Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp new file mode 100644 index 000000000000..2bd672f4d06b --- /dev/null +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp @@ -0,0 +1,46 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_MACROS_HPP +#define KOKKOS_OPENMPTARGET_MACROS_HPP + +// Intel architectures prefer the classical hierarchical parallelism that relies +// on OpenMP. 
+#if defined(KOKKOS_ARCH_INTEL_GPU) +#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU +#endif + +// Define a macro for llvm compiler greater than version 17 and on NVIDIA and +// AMD GPUs. This would be useful in cases where non-OpenMP standard llvm +// extensions can be used. +#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1700) && \ + (defined(KOKKOS_ARCH_AMD_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)) +#define KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS +#endif + +#define KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(x) _Pragma(#x) +#define KOKKOS_IMPL_OMPTARGET_PRAGMA(x) \ + KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(omp target x) + +// Use scratch memory extensions to request dynamic shared memory for the +// right compiler/architecture combination. +#ifdef KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS +#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) ompx_dyn_cgroup_mem(N) +#else +#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) +#endif + +#endif // KOKKOS_OPENMPTARGET_MACROS_HPP diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index 9767d8e53eff..dcc509d2faf9 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -21,16 +21,10 @@ #include #include #include -#include #include #include "Kokkos_OpenMPTarget_Abort.hpp" - -// Intel architectures prefer the classical hierarchical parallelism that relies -// on OpenMP. 
-#if defined(KOKKOS_ARCH_INTEL_GPU) -#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU -#endif +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -113,14 +107,20 @@ class OpenMPTargetExecTeamMember { team_broadcast(value, thread_id); } - // FIXME_OPENMPTARGET this function has the wrong interface and currently - // ignores the reducer passed. - template - KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value, - const JoinOp&) const { + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(ReducerType const& reducer) const noexcept { + team_reduce(reducer, reducer.reference()); + } + + // FIXME_OPENMPTARGET this function currently ignores the reducer passed. + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(ReducerType const&, typename ReducerType::value_type& value) const + noexcept { #pragma omp barrier - using value_type = ValueType; + using value_type = typename ReducerType::value_type; // const JoinLambdaAdapter op(op_in); // Make sure there is enough scratch space: @@ -149,8 +149,9 @@ class OpenMPTargetExecTeamMember { } #pragma omp barrier } - return team_scratch[0]; + value = team_scratch[0]; } + /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. * @@ -249,15 +250,37 @@ class OpenMPTargetExecTeamMember { // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for // hierarchical reduction. There is an additional 10% of the requested // scratch memory allocated per team as padding. Hence the product with 0.1. + // + // Use llvm extensions for dynamic shared memory with compilers/architecture + // combinations where it is supported. + // + // Size allocated in HBM will now change based on whether we use llvm + // extensions. 
+#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + const int total_shmem = shmem_size_L1 + shmem_size_L1 * 0.1; +#else + const int total_shmem = + shmem_size_L0 + shmem_size_L1 + (shmem_size_L0 + shmem_size_L1) * 0.1; +#endif + + // Per team offset for buffer in HBM. const int reduce_offset = - m_shmem_block_index * - (shmem_size_L0 + shmem_size_L1 + - ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE); + m_shmem_block_index * (total_shmem + TEAM_REDUCE_SIZE); + +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + const int l1_offset = reduce_offset + TEAM_REDUCE_SIZE; + char* l0_scratch = + static_cast(llvm_omp_target_dynamic_shared_alloc()); + m_team_shared = scratch_memory_space( + l0_scratch, shmem_size_L0, static_cast(glb_scratch) + l1_offset, + shmem_size_L1); +#else const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE; const int l1_offset = l0_offset + shmem_size_L0; m_team_shared = scratch_memory_space( (static_cast(glb_scratch) + l0_offset), shmem_size_L0, static_cast(glb_scratch) + l1_offset, shmem_size_L1); +#endif m_reduce_scratch = static_cast(glb_scratch) + reduce_offset; m_league_rank = league_rank; m_team_rank = omp_tid; @@ -751,6 +774,7 @@ class OpenMPTargetExec { int64_t thread_local_bytes, int64_t league_size); static void* m_scratch_ptr; + static std::mutex m_mutex_scratch_ptr; static int64_t m_scratch_size; static int* m_lock_array; static uint64_t m_lock_size; diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 1abc925caed5..26085f11400f 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -140,8 +141,10 @@ class ParallelFor, // guarantees that the number of teams specified in the `num_teams` clause is // 
always less than or equal to the maximum concurrently running teams. #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \ - num_teams(max_active_teams) is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams thread_limit(team_size) firstprivate(a_functor) + num_teams(max_active_teams) is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel { if (omp_get_num_teams() > max_active_teams) diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4452af3846d2..caa568a89252 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -55,6 +55,9 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; + // Only let one ParallelReduce instance at a time use the scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: @@ -105,7 +108,8 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()) {} + m_result_ptr_num_elems(arg_result_view.size()), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index a302fa715115..8abffa47a43e 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,6 +470,10 @@ class ParallelReduce m_scratch_memory_lock; + public: void execute() const { const FunctorType& functor = m_functor_reducer.get_functor(); @@ -517,7 +521,8 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 1d6677a1df6b..c1f7851f4137 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -238,8 +238,10 @@ class ParallelScanWithTotal, if (!base_t::m_result_ptr_device_accessible) { const int size = base_t::m_functor_reducer.get_reducer().value_size(); - DeepCopy( - base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); + DeepCopy( + base_t::m_policy.space(), base_t::m_result_ptr, + chunk_values.data() + (n_chunks - 1), size); } } else if 
(!base_t::m_result_ptr_device_accessible) { base_t::m_functor_reducer.get_reducer().init(base_t::m_result_ptr); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index fb75f05f2701..eb3dc3773c4c 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -394,9 +395,11 @@ struct ParallelReduceSpecialize, initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(max_active_teams) thread_limit(team_size) + firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom + : result) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -482,9 +485,11 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+: result) + // Use scratch memory extensions to request dynamic shared memory for + // the right compiler/architecture combination. 
+ KOKKOS_IMPL_OMPTARGET_PRAGMA(teams num_teams(max_active_teams) thread_limit(team_size) map(to: f) \ + is_device_ptr(scratch_ptr) reduction(+: result) \ + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(+ : result) { if (omp_get_num_teams() > max_active_teams) @@ -636,11 +641,13 @@ struct ParallelReduceSpecialize, return; } - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) - { + // Use scratch memory extensions to request dynamic shared memory for the + // right compiler/architecture combination. + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(nteams) thread_limit(team_size) map(to + : f) + is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) { #pragma omp parallel { const int team_num = omp_get_team_num(); @@ -665,9 +672,8 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : final_reducer) \ - is_device_ptr(scratch_ptr) +#pragma omp target teams distribute parallel for simd firstprivate( \ + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp index 41e62ce6e6b3..6878531730d9 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp @@ -438,6 +438,10 @@ class ParallelReduce m_scratch_memory_lock; + public: inline void execute() const { execute_tile( @@ -452,7 +456,8 @@ class ParallelReduce::accessible) {} + typename ViewType::memory_space>::accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} template inline 
std::enable_if_t execute_tile(const FunctorType& functor, diff --git a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 672271ed6b90..9b578aca1129 100644 --- a/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/packages/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -18,7 +18,6 @@ #define KOKKOS_OPENMPTARGETREDUCER_HPP #include -#include #include #include "Kokkos_OpenMPTarget_Abort.hpp" diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 7fa935f693a4..9a246f7642f9 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -88,26 +88,57 @@ bool SYCL::impl_is_initialized() { void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); } void SYCL::print_configuration(std::ostream& os, bool verbose) const { - os << "Devices:\n"; - os << " KOKKOS_ENABLE_SYCL: yes\n"; - os << "\nRuntime Configuration:\n"; - os << "macro KOKKOS_ENABLE_SYCL : defined\n"; +#ifdef KOKKOS_ENABLE_ONEDPL + os << "macro KOKKOS_ENABLE_ONEDPL : defined\n"; +#else + os << "macro KOKKOS_ENABLE_ONEDPL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : defined\n"; #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif - +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; +#else + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n"; #else os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif - if (verbose) + int counter = 0; + int active_device = Kokkos::device_id(); + std::cout << "\nAvailable devices: \n"; + std::vector devices = 
Impl::get_sycl_devices(); + for (const auto& device : devices) { + std::string device_type; + switch (device.get_info()) { + case sycl::info::device_type::cpu: device_type = "cpu"; break; + case sycl::info::device_type::gpu: device_type = "gpu"; break; + case sycl::info::device_type::accelerator: + device_type = "accelerator"; + break; + case sycl::info::device_type::custom: device_type = "custom"; break; + case sycl::info::device_type::automatic: device_type = "automatic"; break; + case sycl::info::device_type::host: device_type = "host"; break; + case sycl::info::device_type::all: device_type = "all"; break; + } + os << "[" << device.get_backend() << "]:" << device_type << ':' << counter + << "] " << device.get_info(); + if (counter == active_device) os << " : Selected"; + os << '\n'; + ++counter; + } + + if (verbose) { + os << '\n'; SYCL::impl_sycl_info(os, m_space_instance->m_queue->get_device()); + } } void SYCL::fence(const std::string& name) const { @@ -137,20 +168,11 @@ void SYCL::impl_static_fence(const std::string& name) { } void SYCL::impl_initialize(InitializationSettings const& settings) { - std::vector gpu_devices = - sycl::device::get_devices(sycl::info::device_type::gpu); - // If the device id is not specified and there are no GPUs, sidestep Kokkos - // device selection and use whatever is available (if no GPU architecture is - // specified). 
-#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if (!settings.has_device_id() && gpu_devices.empty()) { - Impl::SYCLInternal::singleton().initialize(sycl::device()); - Impl::SYCLInternal::m_syclDev = 0; - return; - } -#endif - const auto id = ::Kokkos::Impl::get_gpu(settings); - Impl::SYCLInternal::singleton().initialize(gpu_devices[id]); + const auto& visible_devices = ::Kokkos::Impl::get_visible_devices(); + const auto id = + ::Kokkos::Impl::get_gpu(settings).value_or(visible_devices[0]); + std::vector sycl_devices = Impl::get_sycl_devices(); + Impl::SYCLInternal::singleton().initialize(sycl_devices[id]); Impl::SYCLInternal::m_syclDev = id; } @@ -243,9 +265,32 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, namespace Impl { +std::vector get_sycl_devices() { +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) || \ + defined(KOKKOS_ARCH_AMD_GPU) + std::vector devices = + sycl::device::get_devices(sycl::info::device_type::gpu); +#if defined(KOKKOS_ARCH_INTEL_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_level_zero; +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_cuda; +#elif defined(KOKKOS_ARCH_AMD_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_hip; +#endif + devices.erase(std::remove_if(devices.begin(), devices.end(), + [backend](const sycl::device& d) { + return d.get_backend() != backend; + }), + devices.end()); +#else + std::vector devices = sycl::device::get_devices(); +#endif + return devices; +} + int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); -} +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp index be6b4b893028..0f3d1f0994df 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL.hpp @@ -78,19 +78,15 @@ 
class SYCL { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__SYCL_DEVICE_ONLY__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. */ - static bool sleep(); - - /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */ - static bool wake(); +#endif /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); @@ -188,6 +184,10 @@ std::vector partition_space(const SYCL& sycl_space, sycl::queue(context, device, sycl::property::queue::in_order())); return instances; } + +namespace Impl { +std::vector get_sycl_devices(); +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 080369770d75..0e67adb5787d 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -54,7 +54,7 @@ Kokkos::View sycl_global_unique_token_locks( } SYCLInternal::~SYCLInternal() { - if (!was_finalized || m_scratchSpace || m_scratchFlags) { + if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " "Kokkos::Experimental::SYCL::finalize()" << std::endl; @@ -102,6 +102,23 @@ void SYCLInternal::initialize(const sycl::device& d) { void SYCLInternal::initialize(const sycl::queue& q) { KOKKOS_EXPECTS(!is_initialized()); +#define KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(BACKEND, REQUIRED) \ + if (BACKEND != REQUIRED) \ + Kokkos::abort( \ + "The SYCL execution space instance was initialized with an " \ + "unsupported backend type! 
For this GPU architecture, only " #REQUIRED \ + " is supported.") +#if defined(KOKKOS_ARCH_INTEL_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_level_zero); +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_cuda); +#elif defined(KOKKOS_ARCH_AMD_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_hip); +#endif + if (was_finalized) Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n"); @@ -196,14 +213,22 @@ void SYCLInternal::finalize() { #endif } - using RecordSYCL = Kokkos::Impl::SharedAllocationRecord; + auto device_mem_space = SYCLDeviceUSMSpace(*m_queue); + auto host_mem_space = SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + if (nullptr != m_scratchHost) + host_mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); if (nullptr != m_scratchFlags) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); m_syclDev = -1; m_scratchSpaceCount = 0; m_scratchSpace = nullptr; + m_scratchHostCount = 0; + m_scratchHost = nullptr; m_scratchFlagsCount = 0; m_scratchFlags = nullptr; @@ -228,54 +253,68 @@ void SYCLInternal::finalize() { sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - Record::decrement(Record::get_record(m_scratchSpace)); + 
mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size); + m_scratchSpace = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size)); + } + + return m_scratchSpace; +} + +sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { + if (verify_is_initialized("scratch_unified") && + m_scratchHostCount < scratch_count(size)) { + auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); - Record::increment(r); + if (nullptr != m_scratchHost) + mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchHostCount = scratch_count(size); + + std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( + m_scratchHostCount, sizeScratchGrain); + m_scratchHost = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); } - return m_scratchSpace; + return m_scratchHost; } sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) - Record::decrement(Record::get_record(m_scratchFlags)); + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - 
Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); - } - auto memset_event = m_queue->memset(m_scratchFlags, 0, - m_scratchFlagsCount * sizeScratchGrain); + m_scratchFlags = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); + + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. + auto memset_event = m_queue->memset(m_scratchFlags, 0, + m_scratchFlagsCount * sizeScratchGrain); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); + m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); #endif + } return m_scratchFlags; } @@ -318,15 +357,12 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { assert(m_q); if (m_capacity < n) { - using Record = Kokkos::Impl::SharedAllocationRecord; - // First free what we have (in case malloc can reuse it) - if (m_data) Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + if (m_data) alloc_space.deallocate(m_data, m_capacity); - Record* const r = Record::allocate( - AllocationSpace(*m_q), "Kokkos::Experimental::SYCL::USMObjectMem", n); - Record::increment(r); + m_data = + alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); - m_data = r->data(); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); m_capacity = n; @@ -340,8 +376,8 @@ void SYCLInternal::USMObjectMem::reset() { if (m_data) { // This implies a fence since this class is not copyable // and deallocating implies a fence across all registered queues. 
- using Record = Kokkos::Impl::SharedAllocationRecord; - Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + alloc_space.deallocate(m_data, m_capacity); m_capacity = 0; m_data = nullptr; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 51a617054d6d..ab7e8ce71e06 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -45,6 +45,7 @@ class SYCLInternal { sycl::device_ptr scratch_space(const std::size_t size); sycl::device_ptr scratch_flags(const std::size_t size); + sycl::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, @@ -60,6 +61,8 @@ class SYCLInternal { std::size_t m_scratchSpaceCount = 0; sycl::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + sycl::host_ptr m_scratchHost = nullptr; std::size_t m_scratchFlagsCount = 0; sycl::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory @@ -330,8 +333,8 @@ struct sycl::is_device_copyable< Kokkos::Experimental::Impl::SYCLFunctionWrapper> : std::true_type {}; -// FIXME_SYCL Remove when this specialization when specializations for -// sycl::device_copyable also apply to const-qualified types. 
+#if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ + (defined(__LIBSYCL_MAJOR_VERSION) && __LIBSYCL_MAJOR_VERSION < 7) template struct NonTriviallyCopyableAndDeviceCopyable { NonTriviallyCopyableAndDeviceCopyable( @@ -356,3 +359,4 @@ struct sycl::is_device_copyable< : std::true_type {}; #endif #endif +#endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index f4fada570b0e..7fbf5420f83e 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -118,6 +118,8 @@ class Kokkos::Impl::ParallelFor, const BarePolicy bare_policy(m_policy); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { const auto range = compute_ranges(); const sycl::range<3> global_range = range.get_global_range(); diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 9c5767d209ff..b4de7eb89ffa 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -81,6 +81,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 4fc5818ce9bd..ecb4a863da2d 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -46,9 +46,9 @@ class Kokkos::Impl::ParallelFor, int 
m_shmem_size; sycl::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor/Reduce modify the team scratch memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_lock; + // Only let one ParallelFor instance at a time use the team scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -59,6 +59,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues @@ -74,7 +76,8 @@ class Kokkos::Impl::ParallelFor, auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( - team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0], + KOKKOS_IMPL_SYCL_GET_MULTI_PTR(team_scratch_memory_L0), shmem_begin, + scratch_size[0], global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item, item.get_group_linear_id(), item.get_group_range(1)); @@ -141,9 +144,9 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()), - m_scratch_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 6964c2dbcf0d..f55280e22e38 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ 
b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -78,7 +78,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -95,9 +95,16 @@ class Kokkos::Impl::ParallelReduce results_ptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If n_tiles==0 we only call init() and final() working with the global // scratch memory but don't copy back to m_result_ptr yet. if (n_tiles == 0) { @@ -109,8 +116,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); cgh.single_task([=]() { const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -148,8 +157,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); @@ -223,6 +234,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_mem[local_id * value_count]); else { @@ -268,6 +280,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_value); else { @@ -296,11 +309,13 @@ class Kokkos::Impl::ParallelReduce( - m_space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + m_space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(value_type) * value_count); } return last_reduction_event; @@ -335,9 +350,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 8c900cfa4280..5333e3c8a83a 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -51,7 +51,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -70,11 +70,20 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? 
static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. @@ -168,6 +177,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -210,6 +220,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_value); else { @@ -320,11 +331,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -354,9 +367,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 07145b0fb93c..27165c59e3a9 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -59,9 +59,10 @@ class Kokkos::Impl::ParallelReduce m_scratch_lock; + // Only let one ParallelReduce instance at a time use the team scratch memory + // and the host scratch memory. 
The constructor acquires the mutex which is + // released in the destructor. + std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -79,9 +80,16 @@ class Kokkos::Impl::ParallelReduce>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. @@ -89,8 +97,10 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u))); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -121,9 +131,10 @@ class Kokkos::Impl::ParallelReduce) functor(team_member, update); else @@ -160,12 +171,16 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, sycl::device_ptr results_ptr) { - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast>(m_result_ptr) + : static_cast>( + host_result_ptr); auto lambda = [=](sycl::nd_item<2> item) { auto n_wgroups = item.get_group_range()[1]; int wgroup_size = @@ -173,8 +188,6 @@ class Kokkos::Impl::ParallelReduce( - local_mem[wgroup_size * std::max(value_count, 1u)]); const auto local_id = item.get_local_linear_id(); const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -188,8 +201,8 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } sycl::group_barrier(item.get_group()); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -241,8 +255,8 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } item.barrier(sycl::access::fence_space::local_space); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_value); else { @@ -311,10 +326,7 @@ class Kokkos::Impl::ParallelReduce local_mem( - sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + - (sizeof(unsigned int) + sizeof(value_type) - 1) / - sizeof(value_type), - cgh); + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); const auto init_size = std::max((size + wgroup_size - 1) / wgroup_size, 1); @@ -358,11 +370,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. 
+ if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result not " + "device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -448,9 +462,9 @@ class Kokkos::Impl::ParallelReducem_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { initialize(); } }; diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 04425723e198..977b69bc9eb7 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP -#define KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP +#ifndef KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP #include #include @@ -111,13 +111,13 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - pointer_type m_scratch_space = nullptr; - const pointer_type m_result_ptr; + sycl::host_ptr m_scratch_host = nullptr; + pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one Parallel/Scan modify the shared memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_shared_memory_lock; + // Only let one ParallelScan instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_buffers_lock; private: template @@ -187,6 +187,7 @@ class ParallelScanSYCLBase { } item.barrier(sycl::access::fence_space::global_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; value_type total; reducer.init(&total); @@ -220,6 +221,8 @@ class ParallelScanSYCLBase { sycl::device_ptr global_mem; sycl::device_ptr group_results; + desul::ensure_sycl_lock_arrays_on_device(q); + auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) { sycl::local_accessor num_teams_done(1, cgh); @@ -253,7 +256,8 @@ class ParallelScanSYCLBase { global_mem = static_cast>(instance.scratch_space( n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_space = global_mem; + m_scratch_host = static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; @@ -281,10 +285,11 @@ class ParallelScanSYCLBase { // Write results to global memory auto update_global_results = q.submit([&](sycl::handler& cgh) { - auto result_ptr_device_accessible = m_result_ptr_device_accessible; // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr // directly. - auto result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; + pointer_type result_ptr = m_result_ptr_device_accessible + ? 
m_result_ptr + : static_cast(m_scratch_host); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(perform_work_group_scans); @@ -293,7 +298,6 @@ class ParallelScanSYCLBase { cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { - auto global_mem_copy = global_mem; const index_type global_id = item.get_global_linear_id(); const CombinedFunctorReducer< FunctorType, typename Analysis::Reducer>& functor_reducer = @@ -312,9 +316,7 @@ class ParallelScanSYCLBase { else functor(WorkTag(), global_id + begin, update, true); - global_mem_copy[global_id] = update; - if (global_id == size - 1 && result_ptr_device_accessible) - *result_ptr = update; + if (global_id == size - 1) *result_ptr = update; } }); }); @@ -351,9 +353,9 @@ class ParallelScanSYCLBase { m_policy(arg_policy), m_result_ptr(arg_result_ptr), m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_shared_memory_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_scratch_buffers_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexScratchSpace) {} }; } // namespace Kokkos::Impl @@ -390,11 +392,13 @@ class Kokkos::Impl::ParallelScanWithTotal< Base::impl_execute([&]() { const long long nwork = Base::m_policy.end() - Base::m_policy.begin(); if (nwork > 0 && !Base::m_result_ptr_device_accessible) { + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x + // slower. 
+ m_exec.fence( + "Kokkos::Impl::ParallelReduce::execute: " + "result not device-accessible"); const int size = Base::m_functor_reducer.get_reducer().value_size(); - DeepCopy(m_exec, Base::m_result_ptr, - Base::m_scratch_space + nwork - 1, - size); + std::memcpy(Base::m_result_ptr, Base::m_scratch_host, size); } }); } diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 64b7f56796a5..9cc8008cdf31 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include /*--------------------------------------------------------------------------*/ @@ -243,226 +242,17 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLHostUSMSpace, void>::s_root_record; -#endif - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(space, label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - 
Kokkos::Experimental::SYCL exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& arg_exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_exec_space, space, - label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Impl::DeepCopy( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& 
arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - 
this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
-template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp index 239c6e3ce0b4..b86cfca413c0 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -66,11 +66,6 @@ class SYCLDeviceUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLDeviceUSM"; }; private: @@ -87,6 +82,16 @@ class SYCLSharedUSMSpace { SYCLSharedUSMSpace(); explicit SYCLSharedUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const 
SYCL& exec_space, const char* arg_label, @@ -102,11 +107,6 @@ class SYCLSharedUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLSharedUSM"; }; private: @@ -123,6 +123,16 @@ class SYCLHostUSMSpace { SYCLHostUSMSpace(); explicit SYCLHostUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -138,11 +148,6 @@ class SYCLHostUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLHostUSM"; }; private: @@ -166,19 +171,16 @@ struct is_sycl_type_space : public std::true_type {}; static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + 
Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); template <> struct MemorySpaceAccess -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::SYCLDeviceUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon< - 
Kokkos::Experimental::SYCLSharedUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLSharedUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - 
SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLHostUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl - } // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLHostUSMSpace); + #endif #endif diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index 89c09c3195fe..dbba3827581c 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -140,9 +140,14 @@ class SYCLTeamMember { } value = sg.shuffle(value, 0); + const auto n_subgroups = sg.get_group_range()[0]; + if (n_subgroups == 1) { + reducer.reference() = value; + return; + } + // We 
need to chunk up the whole reduction because we might not have // allocated enough memory. - const auto n_subgroups = sg.get_group_range()[0]; const unsigned int maximum_work_range = std::min(m_team_reduce_size / sizeof(value_type), n_subgroups); diff --git a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 9548f211d9e3..61db6b34aac0 100644 --- a/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/packages/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst, - typename View::const_value_type&) { + const View& dst) { auto event = exec_space.impl_internal_space_instance()->m_queue->memset( dst.data(), 0, dst.size() * sizeof(typename View::value_type)); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -35,12 +34,6 @@ struct ZeroMemset> { ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); #endif } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); - } }; } // namespace Impl diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp b/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp index 071ecdbc4fa3..39b201976b5a 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial.cpp @@ -153,7 +153,7 @@ void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; os << " KOKKOS_ENABLE_SERIAL: yes\n"; -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS os << "Kokkos atomics disabled\n"; #endif diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp index 67119cac164b..43eb4992ed73 100644 --- 
a/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -121,7 +121,10 @@ class Serial { /// For the Serial device, this method always returns false, /// because parallel_for or parallel_reduce with the Serial device /// always execute sequentially. - inline static int in_parallel() { return false; } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static int in_parallel() { return false; } +#endif /// \brief Wait until all dispatched functors complete. /// diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 69787aa5001a..67978aa3e9f7 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_MDRANGE_HPP -#define KOKKO_SERIAL_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP #include #include diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 56894716dbd7..91b4c5671134 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_RANGE_HPP -#define KOKKO_SERIAL_PARALLEL_RANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_RANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_RANGE_HPP #include diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index 0876f1af229d..f34a7daaca00 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef 
KOKKO_SERIAL_PARALLEL_TEAM_HPP -#define KOKKO_SERIAL_PARALLEL_TEAM_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_TEAM_HPP +#define KOKKOS_SERIAL_PARALLEL_TEAM_HPP #include diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp index f9c86f55ce05..5905d6d32e14 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp @@ -121,7 +121,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; auto& data = serial_execution_space.impl_internal_space_instance() @@ -157,7 +157,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; diff --git a/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 3ec2dfbcfa0a..6ad6aabc5a7c 100644 --- a/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/packages/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,14 +35,11 @@ template struct ZeroMemset< std::conditional_t::value, Serial, DummyExecutionSpace>, - View> - : public ZeroMemset> { - using Base = ZeroMemset>; - using Base::Base; - - ZeroMemset(const Serial&, const View& dst, - typename View::const_value_type& value) - : Base(dst, value) {} + View> { + ZeroMemset(const Serial&, const View& dst) { + using ValueType = typename View::value_type; + std::memset(dst.data(), 0, 
sizeof(ValueType) * dst.size()); + } }; } // namespace Impl diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp index c0d70c03ecbe..31653c46cac3 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads.hpp @@ -38,15 +38,6 @@ static_assert(false, /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { -class ThreadsExec; -enum class fence_is_static { yes, no }; -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /** \brief Execution space for a pool of C++11 threads on a CPU. */ @@ -73,7 +64,9 @@ class Threads { /// \brief True if and only if this method is being called in a /// thread-parallel function. - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp similarity index 56% rename from packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp index 801a1ac82e91..3842966cd77b 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -16,17 +16,15 @@ #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #define KOKKOS_IMPL_PUBLIC_INCLUDE +#include "Threads/Kokkos_Threads_Instance.hpp" #endif #include -#include -#include #include #include #include #include -#include #include @@ -41,7 +39,6 @@ namespace Kokkos { namespace Impl { namespace { -std::mutex host_internal_cppthread_mutex; // std::thread compatible driver. 
// Recovery from an exception would require constant intra-thread health @@ -49,7 +46,7 @@ std::mutex host_internal_cppthread_mutex; // abort the process. void internal_cppthread_driver() { try { - ThreadsExec::driver(); + ThreadsInternal::driver(); } catch (const std::exception &x) { std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl; @@ -62,32 +59,17 @@ void internal_cppthread_driver() { } } -ThreadsExec s_threads_process; -ThreadsExec *s_threads_exec[ThreadsExec::MAX_THREAD_COUNT] = {nullptr}; -std::thread::id s_threads_pid[ThreadsExec::MAX_THREAD_COUNT]; -std::pair s_threads_coord[ThreadsExec::MAX_THREAD_COUNT]; +ThreadsInternal s_threads_process; +ThreadsInternal *s_threads_exec[ThreadsInternal::MAX_THREAD_COUNT] = {nullptr}; +std::thread::id s_threads_pid[ThreadsInternal::MAX_THREAD_COUNT]; +std::pair + s_threads_coord[ThreadsInternal::MAX_THREAD_COUNT]; int s_thread_pool_size[3] = {0, 0, 0}; -unsigned s_current_reduce_size = 0; -unsigned s_current_shared_size = 0; - -void (*volatile s_current_function)(ThreadsExec &, const void *); +void (*volatile s_current_function)(ThreadsInternal &, const void *); const void *volatile s_current_function_arg = nullptr; -struct Sentinel { - ~Sentinel() { - if (s_thread_pool_size[0] || s_thread_pool_size[1] || - s_thread_pool_size[2] || s_current_reduce_size || - s_current_shared_size || s_current_function || s_current_function_arg || - s_threads_exec[0]) { - std::cerr << "ERROR : Process exiting while Kokkos::Threads is still " - "initialized" - << std::endl; - } - } -}; - inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); unsigned count = 0; @@ -97,6 +79,12 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } +void wait_yield(volatile ThreadState &flag, const ThreadState value) { + while (value == flag) { + std::this_thread::yield(); + } +} + } // namespace } // namespace Impl } // namespace Kokkos 
@@ -107,66 +95,44 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { namespace Kokkos { namespace Impl { -//---------------------------------------------------------------------------- -// Spawn a thread - -void ThreadsExec::spawn() { - std::thread t(internal_cppthread_driver); - t.detach(); -} - -//---------------------------------------------------------------------------- - -bool ThreadsExec::is_process() { +bool ThreadsInternal::is_process() { static const std::thread::id master_pid = std::this_thread::get_id(); return master_pid == std::this_thread::get_id(); } -void ThreadsExec::global_lock() { host_internal_cppthread_mutex.lock(); } - -void ThreadsExec::global_unlock() { host_internal_cppthread_mutex.unlock(); } - //---------------------------------------------------------------------------- -void ThreadsExec::wait_yield(volatile int &flag, const int value) { - while (value == flag) { - std::this_thread::yield(); - } -} - -void execute_function_noop(ThreadsExec &, const void *) {} +void execute_function_noop(ThreadsInternal &, const void *) {} -void ThreadsExec::driver() { +void ThreadsInternal::driver() { SharedAllocationRecord::tracking_enable(); - ThreadsExec this_thread; + ThreadsInternal this_thread; - while (ThreadsExec::Active == this_thread.m_pool_state) { + while (this_thread.m_pool_state == ThreadState::Active) { (*s_current_function)(this_thread, s_current_function_arg); // Deactivate thread and wait for reactivation - this_thread.m_pool_state = ThreadsExec::Inactive; + this_thread.m_pool_state = ThreadState::Inactive; - wait_yield(this_thread.m_pool_state, ThreadsExec::Inactive); + wait_yield(this_thread.m_pool_state, ThreadState::Inactive); } } -ThreadsExec::ThreadsExec() +ThreadsInternal::ThreadsInternal() : m_pool_base(nullptr), m_scratch(nullptr), m_scratch_reduce_end(0), m_scratch_thread_end(0), - m_numa_rank(0), - m_numa_core_rank(0), m_pool_rank(0), m_pool_size(0), m_pool_fan_size(0), - 
m_pool_state(ThreadsExec::Terminating) { + m_pool_state(ThreadState::Terminating) { if (&s_threads_process != this) { - // A spawned thread - - ThreadsExec *const nil = nullptr; + // The code in the if is executed by a spawned thread not by the root + // thread + ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding const int entry = reinterpret_cast(s_current_function_arg) < @@ -178,80 +144,66 @@ ThreadsExec::ThreadsExec() // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && nil == atomic_compare_exchange(s_threads_exec + entry, nil, this)) { - const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - m_numa_rank = coord.first; - m_numa_core_rank = coord.second; - m_pool_base = s_threads_exec; - m_pool_rank = s_thread_pool_size[0] - (entry + 1); - m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); - m_pool_size = s_thread_pool_size[0]; - m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); - m_pool_state = ThreadsExec::Active; + m_pool_base = s_threads_exec; + m_pool_rank = s_thread_pool_size[0] - (entry + 1); + m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); + m_pool_size = s_thread_pool_size[0]; + m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); + m_pool_state = ThreadState::Active; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); // Inform spawning process that the threads_exec entry has been set. - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; } else { // Inform spawning process that the threads_exec entry could not be set. 
- s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } else { // Enables 'parallel_for' to execute on unitialized Threads device m_pool_rank = 0; m_pool_size = 1; - m_pool_state = ThreadsExec::Inactive; + m_pool_state = ThreadState::Inactive; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); } } -ThreadsExec::~ThreadsExec() { +ThreadsInternal::~ThreadsInternal() { const unsigned entry = m_pool_size - (m_pool_rank + 1); - using Record = Kokkos::Impl::SharedAllocationRecord; - if (m_scratch) { - Record *const r = Record::get_record(m_scratch); - + Kokkos::kokkos_free(m_scratch); m_scratch = nullptr; - - Record::decrement(r); } m_pool_base = nullptr; m_scratch_reduce_end = 0; m_scratch_thread_end = 0; - m_numa_rank = 0; - m_numa_core_rank = 0; m_pool_rank = 0; m_pool_size = 0; m_pool_fan_size = 0; - m_pool_state = ThreadsExec::Terminating; + m_pool_state = ThreadState::Terminating; if (&s_threads_process != this && entry < MAX_THREAD_COUNT) { - ThreadsExec *const nil = nullptr; + ThreadsInternal *const nil = nullptr; atomic_compare_exchange(s_threads_exec + entry, this, nil); - s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } -int ThreadsExec::get_thread_count() { return s_thread_pool_size[0]; } - -ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { - ThreadsExec *const th = +ThreadsInternal *ThreadsInternal::get_thread(const int init_thread_rank) { + ThreadsInternal *const th = init_thread_rank < s_thread_pool_size[0] ? 
s_threads_exec[s_thread_pool_size[0] - (init_thread_rank + 1)] : nullptr; if (nullptr == th || th->m_pool_rank != init_thread_rank) { std::ostringstream msg; - msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " + msg << "Kokkos::Impl::ThreadsInternal::get_thread ERROR : " << "thread " << init_thread_rank << " of " << s_thread_pool_size[0]; if (nullptr == th) { msg << " does not exist"; @@ -264,24 +216,6 @@ ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { return th; } -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { - ThreadsExec::global_lock(); - ThreadsExec::global_unlock(); - - const int n = exec.m_pool_fan_size; - const int rank_rev = exec.m_pool_size - (exec.m_pool_rank + 1); - - for (int i = 0; i < n; ++i) { - Impl::spinwait_while_equal( - exec.m_pool_base[rank_rev + (1 << i)]->m_pool_state, - ThreadsExec::Active); - } - - exec.m_pool_state = ThreadsExec::Inactive; -} - } // namespace Impl } // namespace Kokkos @@ -290,8 +224,8 @@ void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { namespace Kokkos { namespace Impl { -void ThreadsExec::verify_is_process(const std::string &name, - const bool initialized) { +void ThreadsInternal::verify_is_process(const std::string &name, + const bool initialized) { if (!is_process()) { std::string msg(name); msg.append( @@ -307,63 +241,48 @@ void ThreadsExec::verify_is_process(const std::string &name, } } -int ThreadsExec::in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED int ThreadsInternal::in_parallel() { // A thread function is in execution and // the function argument is not the special threads process argument and // the master process is a worker or is not the master process. 
return s_current_function && (&s_threads_process != s_current_function_arg) && (s_threads_process.m_pool_base || !is_process()); } -void ThreadsExec::fence() { internal_fence(Impl::fence_is_static::yes); } -void ThreadsExec::fence(const std::string &name) { - internal_fence(name, Impl::fence_is_static::yes); +#endif +void ThreadsInternal::fence() { + fence("Kokkos::ThreadsInternal::fence: Unnamed Instance Fence"); } - -void ThreadsExec::internal_fence(Impl::fence_is_static is_static) { - internal_fence((is_static == Impl::fence_is_static::no) - ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence" - : "Kokkos::ThreadsExec::fence: Unnamed Static Fence", - is_static); +void ThreadsInternal::fence(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + internal_fence); } // Wait for root thread to become inactive -void ThreadsExec::internal_fence(const std::string &name, - Impl::fence_is_static is_static) { - const auto &fence_lam = [&]() { - if (s_thread_pool_size[0]) { - // Wait for the root thread to complete: - Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, - ThreadsExec::Active); - } +void ThreadsInternal::internal_fence() { + if (s_thread_pool_size[0]) { + // Wait for the root thread to complete: + Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, + ThreadState::Active); + } - s_current_function = nullptr; - s_current_function_arg = nullptr; + s_current_function = nullptr; + s_current_function_arg = nullptr; - // Make sure function and arguments are cleared before - // potentially re-activating threads with a subsequent launch. 
- memory_fence(); - }; - if (is_static == Impl::fence_is_static::yes) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - fence_lam); - } else { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - fence_lam); - } + // Make sure function and arguments are cleared before + // potentially re-activating threads with a subsequent launch. + memory_fence(); } /** \brief Begin execution of the asynchronous functor */ -void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), - const void *arg) { - verify_is_process("ThreadsExec::start", true); +void ThreadsInternal::start(void (*func)(ThreadsInternal &, const void *), + const void *arg) { + verify_is_process("ThreadsInternal::start", true); if (s_current_function || s_current_function_arg) { Kokkos::Impl::throw_runtime_exception( - std::string("ThreadsExec::start() FAILED : already executing")); + std::string("ThreadsInternal::start() FAILED : already executing")); } s_current_function = func; @@ -372,68 +291,29 @@ void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), // Make sure function and arguments are written before activating threads. memory_fence(); - // Activate threads: + // Activate threads. The spawned threads will start working on + // s_current_function. The root thread is only set to active, we still need to + // call s_current_function. 
for (int i = s_thread_pool_size[0]; 0 < i--;) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Active; + s_threads_exec[i]->m_pool_state = ThreadState::Active; } if (s_threads_process.m_pool_size) { // Master process is the root thread, run it: (*func)(s_threads_process, arg); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } } //---------------------------------------------------------------------------- -bool ThreadsExec::sleep() { - verify_is_process("ThreadsExec::sleep", true); - - if (&execute_sleep == s_current_function) return false; - - fence(); - - ThreadsExec::global_lock(); - - s_current_function = &execute_sleep; - - // Activate threads: - for (unsigned i = s_thread_pool_size[0]; 0 < i;) { - s_threads_exec[--i]->m_pool_state = ThreadsExec::Active; - } - - return true; -} - -bool ThreadsExec::wake() { - verify_is_process("ThreadsExec::wake", true); - - if (&execute_sleep != s_current_function) return false; - - ThreadsExec::global_unlock(); - - if (s_threads_process.m_pool_base) { - execute_sleep(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; - } - - fence(); - - return true; -} - -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_resize_scratch_in_serial() { +void ThreadsInternal::execute_resize_scratch_in_serial() { const unsigned begin = s_threads_process.m_pool_base ? 
1 : 0; - auto deallocate_scratch_memory = [](ThreadsExec &exec) { + auto deallocate_scratch_memory = [](ThreadsInternal &exec) { if (exec.m_scratch) { - using Record = - Kokkos::Impl::SharedAllocationRecord; - Record *const r = Record::get_record(exec.m_scratch); - exec.m_scratch = nullptr; - Record::decrement(r); + Kokkos::kokkos_free(exec.m_scratch); + exec.m_scratch = nullptr; } }; if (s_threads_process.m_pool_base) { @@ -449,18 +329,18 @@ void ThreadsExec::execute_resize_scratch_in_serial() { memory_fence(); for (unsigned i = s_thread_pool_size[0]; begin < i;) { - ThreadsExec &th = *s_threads_exec[--i]; + ThreadsInternal &th = *s_threads_exec[--i]; - th.m_pool_state = ThreadsExec::Active; + th.m_pool_state = ThreadState::Active; - wait_yield(th.m_pool_state, ThreadsExec::Active); + wait_yield(th.m_pool_state, ThreadState::Active); } if (s_threads_process.m_pool_base) { deallocate_scratch_memory(s_threads_process); - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; first_touch_allocate_thread_private_scratch(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_current_function_arg = nullptr; @@ -472,27 +352,20 @@ void ThreadsExec::execute_resize_scratch_in_serial() { //---------------------------------------------------------------------------- -void *ThreadsExec::root_reduce_scratch() { +void *ThreadsInternal::root_reduce_scratch() { return s_threads_process.reduce_memory(); } -void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, - const void *) { +void ThreadsInternal::first_touch_allocate_thread_private_scratch( + ThreadsInternal &exec, const void *) { exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end; exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end; if (s_threads_process.m_scratch_thread_end) { // Allocate tracked memory: { - using Record = 
- Kokkos::Impl::SharedAllocationRecord; - Record *const r = - Record::allocate(Kokkos::HostSpace(), "Kokkos::thread_scratch", - s_threads_process.m_scratch_thread_end); - - Record::increment(r); - - exec.m_scratch = r->data(); + exec.m_scratch = Kokkos::kokkos_malloc( + "Kokkos::thread_scratch", s_threads_process.m_scratch_thread_end); } unsigned *ptr = reinterpret_cast(exec.m_scratch); @@ -505,7 +378,7 @@ void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, } } -void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { +void *ThreadsInternal::resize_scratch(size_t reduce_size, size_t thread_size) { enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; fence(); @@ -522,7 +395,7 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { if ((old_reduce_size < reduce_size) || (old_thread_size < thread_size) || ((reduce_size == 0 && thread_size == 0) && (old_reduce_size != 0 || old_thread_size != 0))) { - verify_is_process("ThreadsExec::resize_scratch", true); + verify_is_process("ThreadsInternal::resize_scratch", true); s_threads_process.m_scratch_reduce_end = reduce_size; s_threads_process.m_scratch_thread_end = reduce_size + thread_size; @@ -537,27 +410,22 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { //---------------------------------------------------------------------------- -void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { - verify_is_process("ThreadsExec::print_configuration", false); +void ThreadsInternal::print_configuration(std::ostream &s, const bool detail) { + verify_is_process("ThreadsInternal::print_configuration", false); fence(); - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = - Kokkos::hwloc::get_available_threads_per_core(); - - // Forestall compiler warnings for unused variables. 
- (void)numa_count; - (void)cores_per_numa; - (void)threads_per_core; - s << "Kokkos::Threads"; #if defined(KOKKOS_ENABLE_THREADS) s << " KOKKOS_ENABLE_THREADS"; #endif #if defined(KOKKOS_ENABLE_HWLOC) + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = + Kokkos::hwloc::get_available_threads_per_core(); + s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]"; #endif @@ -569,25 +437,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { if (nullptr == s_threads_process.m_pool_base) { s << " Asynchronous"; } - s << " ReduceScratch[" << s_current_reduce_size << "]" - << " SharedScratch[" << s_current_shared_size << "]"; s << std::endl; if (detail) { for (int i = 0; i < s_thread_pool_size[0]; ++i) { - ThreadsExec *const th = s_threads_exec[i]; + ThreadsInternal *const th = s_threads_exec[i]; if (th) { const int rank_rev = th->m_pool_size - (th->m_pool_rank + 1); - s << " Thread[ " << th->m_pool_rank << " : " << th->m_numa_rank << "." - << th->m_numa_core_rank << " ]"; + s << " Thread[ " << th->m_pool_rank << " ]"; s << " Fan{"; for (int j = 0; j < th->m_pool_fan_size; ++j) { - ThreadsExec *const thfan = th->m_pool_base[rank_rev + (1 << j)]; - s << " [ " << thfan->m_pool_rank << " : " << thfan->m_numa_rank - << "." 
<< thfan->m_numa_core_rank << " ]"; + ThreadsInternal *const thfan = th->m_pool_base[rank_rev + (1 << j)]; + s << " [ " << thfan->m_pool_rank << " ]"; } s << " }"; @@ -605,29 +469,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { //---------------------------------------------------------------------------- -int ThreadsExec::is_initialized() { return nullptr != s_threads_exec[0]; } +int ThreadsInternal::is_initialized() { return nullptr != s_threads_exec[0]; } -void ThreadsExec::initialize(int thread_count_arg) { - // legacy arguments - unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; - unsigned use_numa_count = 0; - unsigned use_cores_per_numa = 0; - bool allow_asynchronous_threadpool = false; - // need to provide an initializer for Intel compilers - static const Sentinel sentinel = {}; +void ThreadsInternal::initialize(int thread_count_arg) { + unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; const bool is_initialized = 0 != s_thread_pool_size[0]; unsigned thread_spawn_failed = 0; - for (int i = 0; i < ThreadsExec::MAX_THREAD_COUNT; i++) + for (int i = 0; i < ThreadsInternal::MAX_THREAD_COUNT; i++) s_threads_exec[i] = nullptr; if (!is_initialized) { - // If thread_count, use_numa_count, or use_cores_per_numa are zero - // then they will be given default values based upon hwloc detection - // and allowed asynchronous execution. - + // If thread_count is zero then it will be given default values based upon + // hwloc detection. 
const bool hwloc_avail = Kokkos::hwloc::available(); const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads(); @@ -640,17 +496,18 @@ void ThreadsExec::initialize(int thread_count_arg) { : 1; } - const unsigned thread_spawn_begin = hwloc::thread_mapping( - "Kokkos::Threads::initialize", allow_asynchronous_threadpool, - thread_count, use_numa_count, use_cores_per_numa, s_threads_coord); + const bool allow_asynchronous_threadpool = false; + unsigned use_numa_count = 0; + unsigned use_cores_per_numa = 0; + hwloc::thread_mapping("Kokkos::Threads::initialize", + allow_asynchronous_threadpool, thread_count, + use_numa_count, use_cores_per_numa, s_threads_coord); const std::pair proc_coord = s_threads_coord[0]; - if (thread_spawn_begin) { - // Synchronous with s_threads_coord[0] as the process core - // Claim entry #0 for binding the process core. - s_threads_coord[0] = std::pair(~0u, ~0u); - } + // Synchronous with s_threads_coord[0] as the process core + // Claim entry #0 for binding the process core. + s_threads_coord[0] = std::pair(~0u, ~0u); s_thread_pool_size[0] = thread_count; s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count; @@ -658,8 +515,8 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = &execute_function_noop; // Initialization work function - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { - s_threads_process.m_pool_state = ThreadsExec::Inactive; + for (unsigned ith = 1; ith < thread_count; ++ith) { + s_threads_process.m_pool_state = ThreadState::Inactive; // If hwloc available then spawned thread will // choose its own entry in 's_threads_coord' @@ -675,18 +532,20 @@ void ThreadsExec::initialize(int thread_count_arg) { // Wait until spawned thread has attempted to initialize. // If spawning and initialization is successful then // an entry in 's_threads_exec' will be assigned. 
- ThreadsExec::spawn(); - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); - if (s_threads_process.m_pool_state == ThreadsExec::Terminating) break; + std::thread t(internal_cppthread_driver); + t.detach(); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); + if (s_threads_process.m_pool_state == ThreadState::Terminating) break; } // Wait for all spawned threads to deactivate before zeroing the function. - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { + for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. - ThreadsExec *const th = ((ThreadsExec * volatile *)s_threads_exec)[ith]; + ThreadsInternal *const th = + ((ThreadsInternal * volatile *)s_threads_exec)[ith]; if (th) { - wait_yield(th->m_pool_state, ThreadsExec::Active); + wait_yield(th->m_pool_state, ThreadState::Active); } else { ++thread_spawn_failed; } @@ -694,7 +553,7 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = nullptr; s_current_function_arg = nullptr; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; memory_fence(); @@ -705,30 +564,17 @@ void ThreadsExec::initialize(int thread_count_arg) { Kokkos::hwloc::bind_this_thread(proc_coord); } - if (thread_spawn_begin) { // Include process in pool. 
- const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - s_threads_exec[0] = &s_threads_process; - s_threads_process.m_numa_rank = coord.first; - s_threads_process.m_numa_core_rank = coord.second; - s_threads_process.m_pool_base = s_threads_exec; - s_threads_process.m_pool_rank = - thread_count - 1; // Reversed for scan-compatible reductions - s_threads_process.m_pool_size = thread_count; - s_threads_process.m_pool_fan_size = fan_size( - s_threads_process.m_pool_rank, s_threads_process.m_pool_size); - s_threads_pid[s_threads_process.m_pool_rank] = - std::this_thread::get_id(); - } else { - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 0; - s_threads_process.m_pool_fan_size = 0; - } + s_threads_exec[0] = &s_threads_process; + s_threads_process.m_pool_base = s_threads_exec; + s_threads_process.m_pool_rank = + thread_count - 1; // Reversed for scan-compatible reductions + s_threads_process.m_pool_size = thread_count; + s_threads_process.m_pool_fan_size = fan_size( + s_threads_process.m_pool_rank, s_threads_process.m_pool_size); + s_threads_pid[s_threads_process.m_pool_rank] = std::this_thread::get_id(); // Initial allocations: - ThreadsExec::resize_scratch(1024, 1024); + ThreadsInternal::resize_scratch(1024, 1024); } else { s_thread_pool_size[0] = 0; s_thread_pool_size[1] = 0; @@ -773,8 +619,8 @@ void ThreadsExec::initialize(int thread_count_arg) { //---------------------------------------------------------------------------- -void ThreadsExec::finalize() { - verify_is_process("ThreadsExec::finalize", false); +void ThreadsInternal::finalize() { + verify_is_process("ThreadsInternal::finalize", false); fence(); @@ -784,18 +630,18 @@ void ThreadsExec::finalize() { for (unsigned i = s_thread_pool_size[0]; begin < i--;) { if (s_threads_exec[i]) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating; + s_threads_exec[i]->m_pool_state = ThreadState::Terminating; - 
wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_threads_pid[i] = std::thread::id(); } if (s_threads_process.m_pool_base) { - (&s_threads_process)->~ThreadsExec(); + (&s_threads_process)->~ThreadsInternal(); s_threads_exec[0] = nullptr; } @@ -808,13 +654,11 @@ void ThreadsExec::finalize() { s_thread_pool_size[2] = 0; // Reset master thread to run solo. - s_threads_process.m_numa_rank = 0; - s_threads_process.m_numa_core_rank = 0; - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 1; - s_threads_process.m_pool_fan_size = 0; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_base = nullptr; + s_threads_process.m_pool_rank = 0; + s_threads_process.m_pool_size = 1; + s_threads_process.m_pool_fan_size = 0; + s_threads_process.m_pool_state = ThreadState::Inactive; } //---------------------------------------------------------------------------- @@ -834,7 +678,7 @@ int Threads::concurrency() const { return impl_thread_pool_size(0); } #endif void Threads::fence(const std::string &name) const { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no); + Impl::ThreadsInternal::fence(name); } Threads &Threads::impl_instance(int) { diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp similarity index 76% rename from packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp index 377e096bfbeb..a5eb231cb011 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_THREADSEXEC_HPP -#define 
KOKKOS_THREADSEXEC_HPP +#ifndef KOKKOS_THREADS_INSTANCE_HPP +#define KOKKOS_THREADS_INSTANCE_HPP #include @@ -23,41 +23,25 @@ #include #include -#include - #include #include #include #include +#include +#include //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -class ThreadsExec { +class ThreadsInternal { public: // Fan array has log_2(NT) reduction threads plus 2 scan threads // Currently limited to 16k threads. - enum { MAX_FAN_COUNT = 16 }; - enum { MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2) }; - enum { VECTOR_LENGTH = 8 }; - - /** \brief States of a worker thread */ - enum { - Terminating ///< Termination in progress - , - Inactive ///< Exists, waiting for work - , - Active ///< Exists, performing work - , - Rendezvous ///< Exists, waiting in a barrier or reduce - - , - ScanCompleted, - ScanAvailable, - ReductionAvailable - }; + static constexpr int MAX_FAN_COUNT = 16; + static constexpr int MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2); + static constexpr int VECTOR_LENGTH = 8; private: friend class Kokkos::Threads; @@ -67,18 +51,16 @@ class ThreadsExec { // the threads that need them. // For a simple reduction the thread location is arbitrary. 
- ThreadsExec *const *m_pool_base; ///< Base for pool fan-in + ThreadsInternal *const *m_pool_base; ///< Base for pool fan-in void *m_scratch; int m_scratch_reduce_end; size_t m_scratch_thread_end; - int m_numa_rank; - int m_numa_core_rank; int m_pool_rank; int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - int volatile m_pool_state; ///< State for global synchronizations + ThreadState volatile m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -93,41 +75,36 @@ class ThreadsExec { static void global_lock(); static void global_unlock(); - static void spawn(); - static void first_touch_allocate_thread_private_scratch(ThreadsExec &, + static void first_touch_allocate_thread_private_scratch(ThreadsInternal &, const void *); - static void execute_sleep(ThreadsExec &, const void *); - ThreadsExec(const ThreadsExec &); - ThreadsExec &operator=(const ThreadsExec &); + ThreadsInternal(const ThreadsInternal &); + ThreadsInternal &operator=(const ThreadsInternal &); static void execute_resize_scratch_in_serial(); public: KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size; } KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank; } - KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank; } - KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank; } inline long team_work_index() const { return m_team_work_index; } - static int get_thread_count(); - static ThreadsExec *get_thread(const int init_thread_rank); + static ThreadsInternal *get_thread(const int init_thread_rank); inline void *reduce_memory() const { return m_scratch; } KOKKOS_INLINE_FUNCTION void *scratch_memory() const { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION int volatile &state() { return m_pool_state; } - KOKKOS_INLINE_FUNCTION ThreadsExec *const *pool_base() const { + KOKKOS_INLINE_FUNCTION ThreadState volatile 
&state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } static void driver(void); - ~ThreadsExec(); - ThreadsExec(); + ~ThreadsInternal(); + ThreadsInternal(); static void *resize_scratch(size_t reduce_size, size_t thread_size); @@ -143,15 +120,8 @@ class ThreadsExec { static void finalize(); - /* Given a requested team size, return valid team size */ - static unsigned team_size_valid(unsigned); - static void print_configuration(std::ostream &, const bool detail = false); - //------------------------------------ - - static void wait_yield(volatile int &, const int); - //------------------------------------ // All-thread functions: @@ -166,14 +136,14 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast @@ -191,7 +161,7 @@ class ThreadsExec { memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } @@ -207,21 +177,21 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } 
if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } } @@ -234,9 +204,9 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join( reinterpret_cast(reduce_memory()), @@ -265,8 +235,8 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } } @@ -289,10 +259,10 @@ class ThreadsExec { //-------------------------------- // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: Active -> ReductionAvailable (or ScanAvailable) - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join(work_value, fan.reduce_memory()); } @@ -303,39 +273,37 @@ class ThreadsExec { if (rev_rank) { // Set: Active -> ReductionAvailable - m_pool_state = ThreadsExec::ReductionAvailable; + m_pool_state = ThreadState::ReductionAvailable; // 
Wait for contributing threads' scan value to be available. if ((1 << m_pool_fan_size) < (m_pool_rank + 1)) { - ThreadsExec &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; + ThreadsInternal &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; // Wait: Active -> ReductionAvailable // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(th.m_pool_state, ThreadsExec::Active); - Impl::spinwait_while_equal(th.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(th.m_pool_state, ThreadState::Active); + spinwait_while_equal(th.m_pool_state, ThreadState::ReductionAvailable); f.join(work_value + count, ((scalar_type *)th.reduce_memory()) + count); } // This thread has completed inclusive scan // Set: ReductionAvailable -> ScanAvailable - m_pool_state = ThreadsExec::ScanAvailable; + m_pool_state = ThreadState::ScanAvailable; // Wait for all threads to complete inclusive scan // Wait: ScanAvailable -> Rendezvous - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanAvailable); + spinwait_while_equal(m_pool_state, ThreadState::ScanAvailable); } //-------------------------------- for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(fan.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(fan.m_pool_state, ThreadState::ReductionAvailable); // Set: ScanAvailable -> Rendezvous - fan.m_pool_state = ThreadsExec::Rendezvous; + fan.m_pool_state = ThreadState::Rendezvous; } // All threads have completed the inclusive scan. 
@@ -346,7 +314,7 @@ class ThreadsExec { if ((rev_rank + 1) < m_pool_size) { // Exclusive scan: copy the previous thread's inclusive scan value - ThreadsExec &th = *m_pool_base[rev_rank + 1]; // Not the root thread + ThreadsInternal &th = *m_pool_base[rev_rank + 1]; // Not the root thread const scalar_type *const src_value = ((scalar_type *)th.reduce_memory()) + count; @@ -362,19 +330,18 @@ class ThreadsExec { // Wait for all threads to copy previous thread's inclusive scan value // Wait for all threads: Rendezvous -> ScanCompleted for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Rendezvous); } if (rev_rank) { // Set: ScanAvailable -> ScanCompleted - m_pool_state = ThreadsExec::ScanCompleted; + m_pool_state = ThreadState::ScanCompleted; // Wait: ScanCompleted -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanCompleted); + spinwait_while_equal(m_pool_state, ThreadState::ScanCompleted); } // Set: ScanCompleted -> Active for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -391,8 +358,8 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } for (unsigned i = 0; i < count; ++i) { @@ -400,9 +367,9 @@ class ThreadsExec { } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + 
spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the thread-scan before releasing threads @@ -424,7 +391,7 @@ class ThreadsExec { } for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -433,18 +400,14 @@ class ThreadsExec { * complete and release the Threads device. * Acquire the Threads device and start this functor. */ - static void start(void (*)(ThreadsExec &, const void *), const void *); + static void start(void (*)(ThreadsInternal &, const void *), const void *); - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif static void fence(); static void fence(const std::string &); - static void internal_fence( - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static void internal_fence( - const std::string &, - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static bool sleep(); - static bool wake(); + static void internal_fence(); /* Dynamic Scheduling related functionality */ // Initialize the work range for this thread @@ -583,30 +546,38 @@ class ThreadsExec { namespace Kokkos { -inline int Threads::in_parallel() { return Impl::ThreadsExec::in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline int Threads::in_parallel() { + return Impl::ThreadsInternal::in_parallel(); +} +#endif inline int Threads::impl_is_initialized() { - return Impl::ThreadsExec::is_initialized(); + return Impl::ThreadsInternal::is_initialized(); } inline void Threads::impl_initialize(InitializationSettings const &settings) { - Impl::ThreadsExec::initialize( + Impl::ThreadsInternal::initialize( settings.has_num_threads() ? 
settings.get_num_threads() : -1); } -inline void Threads::impl_finalize() { Impl::ThreadsExec::finalize(); } +inline void Threads::impl_finalize() { Impl::ThreadsInternal::finalize(); } inline void Threads::print_configuration(std::ostream &os, bool verbose) const { os << "Host Parallel Execution Space:\n"; os << " KOKKOS_ENABLE_THREADS: yes\n"; os << "\nThreads Runtime Configuration:\n"; - Impl::ThreadsExec::print_configuration(os, verbose); + Impl::ThreadsInternal::print_configuration(os, verbose); } inline void Threads::impl_static_fence(const std::string &name) { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes); + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + Impl::ThreadsInternal::internal_fence); } } /* namespace Kokkos */ -#endif /* #define KOKKOS_THREADSEXEC_HPP */ +#endif diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 0828f262993c..59577609ab78 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -46,54 +46,54 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); 
self.exec_range(range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; self.exec_range(begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 3698416ef187..4a89c4fad823 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -59,37 +59,37 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template 
static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); ParallelFor::template exec_range(self.m_functor, range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = @@ -100,16 +100,16 @@ class ParallelFor, ? 
begin + self.m_policy.chunk_size() : self.m_policy.end(); ParallelFor::template exec_range(self.m_functor, begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index 36404857a228..f927d7c6a67e 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -73,14 +73,14 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); ParallelFor::exec_team( - self.m_functor, Member(&exec, self.m_policy, self.m_shared)); + self.m_functor, Member(&instance, self.m_policy, self.m_shared)); - exec.barrier(); - exec.fan_in(); + instance.barrier(); + instance.fan_in(); } template Policy fix_policy(Policy policy) { @@ -96,12 +96,12 @@ class ParallelFor, public: inline void execute() const { - ThreadsExec::resize_scratch( + ThreadsInternal::resize_scratch( 0, Policy::member_type::team_reduce_size() + m_shared); - ThreadsExec::start(&ParallelFor::exec, this); + ThreadsInternal::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index 3d06379480f7..fa63215a9e5d 100644 --- 
a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -54,67 +54,67 @@ class ParallelReduce(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); self.exec_range( range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); - reference_type update = - self.m_reducer.init(static_cast(exec.reduce_memory())); + reference_type update = self.m_reducer.init( + static_cast(instance.reduce_memory())); while (work_index != -1) { 
const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; self.exec_range(begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(self.m_reducer); + instance.fan_in_reduce(self.m_reducer); } public: inline void execute() const { const ReducerType &reducer = m_iter.m_func.get_reducer(); - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp index 5fa97b403c4e..bf4c2a532a17 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp @@ -68,42 +68,44 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); ParallelReduce::template exec_range( 
self.m_functor_reducer.get_functor(), range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); reference_type update = - reducer.init(static_cast(exec.reduce_memory())); + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index) * self.m_policy.chunk_size() + @@ -114,10 +116,10 @@ class ParallelReduce, : self.m_policy.end(); ParallelReduce::template exec_range( self.m_functor_reducer.get_functor(), begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } public: @@ -130,15 +132,15 @@ class ParallelReduce, reducer.final(m_result_ptr); } } else { - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, 
this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index c4b6100a9df2..4db310701f9f 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -58,16 +58,16 @@ class ParallelReduce( self.m_functor_reducer.get_functor(), - Member(&exec, self.m_policy, self.m_shared), + Member(&instance, self.m_policy, self.m_shared), self.m_functor_reducer.get_reducer().init( - static_cast(exec.reduce_memory()))); + static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(self.m_functor_reducer.get_reducer()); + instance.fan_in_reduce(self.m_functor_reducer.get_reducer()); } public: @@ -80,17 +80,17 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScan &self = *((const ParallelScan *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); + final_reducer.init(static_cast(instance.reduce_memory())); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large( final_reducer ); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + 
instance.fan_in(); } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScan::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScan::exec, this); + ThreadsInternal::fence(); } ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) @@ -145,37 +145,37 @@ class ParallelScanWithTotal, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); + final_reducer.init(static_cast(instance.reduce_memory())); ParallelScanWithTotal::template exec_range( self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large(final_reducer); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScanWithTotal::template exec_range( self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); - if (exec.pool_rank() == exec.pool_size() - 1) { + if (instance.pool_rank() == instance.pool_size() - 1) { *self.m_result_ptr = update; } } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScanWithTotal::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScanWithTotal::exec, this); + ThreadsInternal::fence(); } template diff --git a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp 
b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp similarity index 90% rename from packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp index 0a7eda29bcf1..3df9dc07bf43 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Spinwait.cpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include @@ -108,5 +108,15 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value == flag) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp similarity index 52% rename from packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp index acd2a572c8c2..b98b6dbb73bc 100644 --- a/packages/kokkos/algorithms/src/std_algorithms/Kokkos_Swap.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -14,28 +14,30 @@ // //@HEADER -#ifndef KOKKOS_STD_ALGORITHMS_SWAP_HPP -#define KOKKOS_STD_ALGORITHMS_SWAP_HPP +#ifndef KOKKOS_THREADS_SPINWAIT_HPP +#define KOKKOS_THREADS_SPINWAIT_HPP -#include +#include + +#include namespace Kokkos { -namespace Experimental { - -// swap -template -KOKKOS_INLINE_FUNCTION void swap(T& a, T& b) noexcept { - static_assert( - std::is_move_assignable::value && std::is_move_constructible::value, - "Kokkos::Experimental::swap arguments must be move assignable " - "and move constructible"); - - T tmp = std::move(a); - a = std::move(b); - b = std::move(tmp); -} - -} // namespace Experimental +namespace Impl { + +enum class 
WaitMode : int { + ACTIVE // Used for tight loops to keep threads active longest + , + PASSIVE // Used to quickly yield the thread to quite down the system + , + ROOT // Never sleep or yield the root thread +}; + +void host_thread_yield(const uint32_t i, const WaitMode mode); + +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value); + +} // namespace Impl } // namespace Kokkos #endif diff --git a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_State.hpp similarity index 59% rename from packages/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_State.hpp index 21ba7fad01cf..148e9aa4e057 100644 --- a/packages/kokkos/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_State.hpp @@ -14,16 +14,26 @@ // //@HEADER -#ifndef KOKKOS_HBWSPACE_FWD_HPP_ -#define KOKKOS_HBWSPACE_FWD_HPP_ +#ifndef KOKKOS_THREADS_STATE_HPP +#define KOKKOS_THREADS_STATE_HPP -#ifdef KOKKOS_ENABLE_HBWSPACE namespace Kokkos { - -namespace Experimental { -class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. 
for KNL - /// processor) -} // namespace Experimental +namespace Impl { +/** \brief States of a worker thread */ +enum class ThreadState { + Terminating ///< Termination in progress + , + Inactive ///< Exists, waiting for work + , + Active ///< Exists, performing work + , + Rendezvous ///< Exists, waiting in a barrier or reduce + , + ScanCompleted, + ScanAvailable, + ReductionAvailable +}; +} // namespace Impl } // namespace Kokkos -#endif + #endif diff --git a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp similarity index 95% rename from packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp rename to packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index b1cadc7c485d..fd0f221365b5 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -22,10 +22,11 @@ #include #include -#include #include #include +#include +#include //---------------------------------------------------------------------------- @@ -50,8 +51,8 @@ class ThreadsExecTeamMember { private: using space = execution_space::scratch_memory_space; - ThreadsExec* const m_exec; - ThreadsExec* const* m_team_base; ///< Base for team fan-in + ThreadsInternal* const m_instance; + ThreadsInternal* const* m_team_base; ///< Base for team fan-in space m_team_shared; size_t m_team_shared_size; int m_team_size; @@ -84,14 +85,13 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - Impl::spinwait_while_equal(m_team_base[j]->state(), - ThreadsExec::Active); + spinwait_while_equal(m_team_base[j]->state(), ThreadState::Active); } // If not root then wait for release if (m_team_rank_rev) { - m_exec->state() = ThreadsExec::Rendezvous; - Impl::spinwait_while_equal(m_exec->state(), ThreadsExec::Rendezvous); + m_instance->state() = ThreadState::Rendezvous; + spinwait_while_equal(m_instance->state(), 
ThreadState::Rendezvous); } return !m_team_rank_rev; @@ -102,7 +102,7 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - m_team_base[j]->state() = ThreadsExec::Active; + m_team_base[j]->state() = ThreadState::Active; } } @@ -188,10 +188,10 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return value; + if (m_instance == nullptr) return value; if (team_rank() != team_size() - 1) * - ((volatile type*)m_exec->scratch_memory()) = value; + ((volatile type*)m_instance->scratch_memory()) = value; memory_fence(); @@ -229,9 +229,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return; + if (m_instance == nullptr) return; - type* const local_value = ((type*)m_exec->scratch_memory()); + type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution if (team_rank() != team_size() - 1) { *local_value = contribution; } @@ -285,9 +285,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return type(0); + if (m_instance == nullptr) return type(0); - volatile type* const work_value = ((type*)m_exec->scratch_memory()); + volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -342,10 +342,10 @@ class ThreadsExecTeamMember { template ThreadsExecTeamMember( - Impl::ThreadsExec* exec, + Impl::ThreadsInternal* instance, const TeamPolicyInternal& team, const size_t shared_size) - : m_exec(exec), + : m_instance(instance), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(shared_size), @@ -361,9 +361,11 @@ class ThreadsExecTeamMember { if (team.league_size()) { // Execution is using device-team interface: - const int pool_rank_rev = m_exec->pool_size() - (m_exec->pool_rank() + 1); + const int pool_rank_rev = + m_instance->pool_size() - (m_instance->pool_rank() + 1); const int team_rank_rev = 
pool_rank_rev % team.team_alloc(); - const size_t pool_league_size = m_exec->pool_size() / team.team_alloc(); + const size_t pool_league_size = + m_instance->pool_size() / team.team_alloc(); const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc(); if (pool_league_rank_rev >= pool_league_size) { m_invalid_thread = 1; @@ -372,7 +374,7 @@ class ThreadsExecTeamMember { const size_t pool_league_rank = pool_league_size - (pool_league_rank_rev + 1); - const int pool_num_teams = m_exec->pool_size() / team.team_alloc(); + const int pool_num_teams = m_instance->pool_size() / team.team_alloc(); const int chunk_size = team.chunk_size() > 0 ? team.chunk_size() : team.team_iter(); const int chunks_per_team = @@ -387,8 +389,8 @@ class ThreadsExecTeamMember { if ((team.team_alloc() > size_t(m_team_size)) ? (team_rank_rev >= m_team_size) - : (m_exec->pool_size() - pool_num_teams * m_team_size > - m_exec->pool_rank())) + : (m_instance->pool_size() - pool_num_teams * m_team_size > + m_instance->pool_rank())) m_invalid_thread = 1; else m_invalid_thread = 0; @@ -398,7 +400,7 @@ class ThreadsExecTeamMember { if (team_rank_rev < team.team_size() && !m_invalid_thread) { m_team_base = - m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev; + m_instance->pool_base() + team.team_alloc() * pool_league_rank_rev; m_team_size = team.team_size(); m_team_rank = team.team_size() - (team_rank_rev + 1); m_team_rank_rev = team_rank_rev; @@ -413,13 +415,13 @@ class ThreadsExecTeamMember { } if ((m_team_rank_rev == 0) && (m_invalid_thread == 0)) { - m_exec->set_work_range(m_league_rank, m_league_end, m_chunk_size); - m_exec->reset_steal_target(m_team_size); + m_instance->set_work_range(m_league_rank, m_league_end, m_chunk_size); + m_instance->reset_steal_target(m_team_size); } if (std::is_same::schedule_type::type, Kokkos::Dynamic>::value) { - m_exec->barrier(); + m_instance->barrier(); } } else { m_invalid_thread = 1; @@ -427,7 +429,7 @@ class ThreadsExecTeamMember { } 
ThreadsExecTeamMember() - : m_exec(nullptr), + : m_instance(nullptr), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(0), @@ -442,8 +444,8 @@ class ThreadsExecTeamMember { m_invalid_thread(0), m_team_alloc(0) {} - inline ThreadsExec& threads_exec_team_base() const { - return m_team_base ? **m_team_base : *m_exec; + inline ThreadsInternal& threads_exec_team_base() const { + return m_team_base ? **m_team_base : *m_instance; } bool valid_static() const { return m_league_rank < m_league_end; } @@ -999,8 +1001,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, false); } + auto& team_member = loop_bounds.thread; + // 'scan_val' output is the exclusive prefix sum - scan_val = loop_bounds.thread.team_scan(scan_val); + scan_val = team_member.team_scan(scan_val); #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -1010,6 +1014,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, true); } + team_member.team_broadcast(scan_val, team_member.team_size() - 1); + return_val = scan_val; } diff --git a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index d4ce697548fa..c88d66db5f9a 100644 --- a/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/packages/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -18,7 +18,7 @@ #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP #include -#include +#include namespace Kokkos { namespace Impl { @@ -61,16 +61,17 @@ class ParallelFor, } } - static inline void thread_main(ThreadsExec& exec, const void* arg) noexcept { + static inline void thread_main(ThreadsInternal& instance, + const void* arg) noexcept { const Self& self = *(static_cast(arg)); self.exec_one_thread(); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() { - ThreadsExec::start(&Self::thread_main, this); - ThreadsExec::fence(); + ThreadsInternal::start(&Self::thread_main, this); + 
ThreadsInternal::fence(); } inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp index e115f7051f3a..cf405e57b8f9 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp @@ -25,9 +25,13 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include #include #include diff --git a/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp b/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp index f5cbc0c1d1d6..4d7caec6f5fa 100644 --- a/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp +++ b/packages/kokkos/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -19,7 +19,7 @@ #if defined(KOKKOS_ENABLE_THREADS) #include -#include +#include #include #include #include @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Core.cpp b/packages/kokkos/core/src/impl/Kokkos_Core.cpp index 5c182db5663a..4a696526161e 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_Core.cpp @@ -90,8 +90,6 @@ void combine(Kokkos::InitializationSettings& out, KOKKOS_IMPL_COMBINE_SETTING(num_threads); KOKKOS_IMPL_COMBINE_SETTING(map_device_id_by); KOKKOS_IMPL_COMBINE_SETTING(device_id); - KOKKOS_IMPL_COMBINE_SETTING(num_devices); - KOKKOS_IMPL_COMBINE_SETTING(skip_device); KOKKOS_IMPL_COMBINE_SETTING(disable_warnings); KOKKOS_IMPL_COMBINE_SETTING(tune_internals); KOKKOS_IMPL_COMBINE_SETTING(tools_help); @@ -131,11 +129,15 @@ void combine(Kokkos::Tools::InitArguments& out, int get_device_count() { #if defined(KOKKOS_ENABLE_CUDA) - return Kokkos::Cuda::detect_device_count(); + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; #elif 
defined(KOKKOS_ENABLE_HIP) - return Kokkos::HIP::detect_device_count(); + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_SYCL) - return sycl::device::get_devices(sycl::info::device_type::gpu).size(); + return Kokkos::Experimental::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -165,20 +167,43 @@ bool is_valid_map_device_id_by(std::string const& x) { } // namespace +std::vector const& Kokkos::Impl::get_visible_devices() { + static auto devices = get_visible_devices(get_device_count()); + return devices; +} + [[nodiscard]] int Kokkos::device_id() noexcept { #if defined(KOKKOS_ENABLE_CUDA) - return Cuda().cuda_device(); + int device = Cuda().cuda_device(); #elif defined(KOKKOS_ENABLE_HIP) - return HIP().hip_device(); + int device = HIP().hip_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - return Experimental::OpenACC().acc_device_number(); + int device = Experimental::OpenACC().acc_device_number(); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_default_device(); // FIXME_OPENMPTARGET + int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - return Experimental::Impl::SYCLInternal::m_syclDev; + int device = Experimental::Impl::SYCLInternal::m_syclDev; #else - return -1; + int device = -1; + return device; #endif + auto const& visible_devices = Impl::get_visible_devices(); + for (std::size_t i = 0; i < visible_devices.size(); ++i) { + if (visible_devices[i] == device) { + return i; + } + } + Kokkos::abort("Unexpected error: cannot determine device id"); + return -1; +} + +[[nodiscard]] int Kokkos::num_devices() noexcept { + if constexpr (std::is_same_v) { + return -1; // no GPU backend enabled + } else { + return Impl::get_visible_devices().size(); + } } [[nodiscard]] int Kokkos::num_threads() noexcept { @@ -313,8 +338,7 @@ int 
Kokkos::Impl::get_ctest_gpu(int local_rank) { return std::stoi(id.c_str()); } -std::vector Kokkos::Impl::get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count) { +std::vector Kokkos::Impl::get_visible_devices(int device_count) { std::vector visible_devices; char* env_visible_devices = std::getenv("KOKKOS_VISIBLE_DEVICES"); if (env_visible_devices) { @@ -341,30 +365,9 @@ std::vector Kokkos::Impl::get_visible_devices( } } } else { - int num_devices = - settings.has_num_devices() ? settings.get_num_devices() : device_count; - if (num_devices > device_count) { - std::stringstream ss; - ss << "Error: Specified number of devices '" << num_devices - << "' exceeds the actual number of GPUs available for execution '" - << device_count << "'." - << " Raised by Kokkos::initialize().\n"; - Kokkos::abort(ss.str().c_str()); - } - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < device_count; ++i) { visible_devices.push_back(i); } - if (settings.has_skip_device()) { - if (visible_devices.size() == 1 && settings.get_skip_device() == 0) { - Kokkos::abort( - "Error: skipping the only GPU available for execution.\n" - " Raised by Kokkos::initialize().\n"); - } - visible_devices.erase( - std::remove(visible_devices.begin(), visible_devices.end(), - settings.get_skip_device()), - visible_devices.end()); - } } if (visible_devices.empty()) { Kokkos::abort( @@ -374,10 +377,10 @@ std::vector Kokkos::Impl::get_visible_devices( return visible_devices; } -int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { - std::vector visible_devices = - get_visible_devices(settings, get_device_count()); - int const num_devices = visible_devices.size(); +std::optional Kokkos::Impl::get_gpu( + const InitializationSettings& settings) { + std::vector visible_devices = get_visible_devices(get_device_count()); + int const num_devices = visible_devices.size(); // device_id is provided if (settings.has_device_id()) { int const id = 
settings.get_device_id(); @@ -423,14 +426,15 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { int const mpi_local_rank = mpi_local_rank_on_node(); - // use first GPU available for execution if unable to detect local MPI rank + // if unable to detect local MPI rank return nullopt to delegate device + // selection to the backend if (mpi_local_rank < 0) { if (settings.has_map_device_id_by()) { std::cerr << "Warning: unable to detect local MPI rank." << " Falling back to the first GPU available for execution." << " Raised by Kokkos::initialize()." << std::endl; } - return visible_devices[0]; + return std::nullopt; } // use device assigned by CTest when resource allocation is activated @@ -445,13 +449,6 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { namespace { void initialize_backends(const Kokkos::InitializationSettings& settings) { -// This is an experimental setting -// For KNL in Flat mode this variable should be set, so that -// memkind allocates high bandwidth memory correctly. 
-#ifdef KOKKOS_ENABLE_HBWSPACE - setenv("MEMKIND_HBW_NODES", "1", 0); -#endif - Kokkos::Impl::ExecSpaceManager::get_instance().initialize_spaces(settings); } @@ -571,19 +568,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "no"); #endif -#ifdef KOKKOS_ENABLE_HBWSPACE - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "no"); -#endif -#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "no"); -#endif - #ifdef KOKKOS_ENABLE_ASM declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "yes"); #else @@ -604,6 +588,11 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX23", "no"); #endif +#ifdef KOKKOS_ENABLE_CXX26 + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "no"); +#endif #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK", "yes"); @@ -616,11 +605,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "no"); #endif -#ifdef KOKKOS_ENABLE_LIBRT - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes"); -#else - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no"); -#endif #ifdef KOKKOS_ENABLE_LIBDL declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "yes"); #else @@ -645,8 +629,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "ARMV8_THUNDERX2"); #elif defined(KOKKOS_ARCH_BDW) declare_configuration_metadata("architecture", "CPU architecture", 
"BDW"); -#elif defined(KOKKOS_ARCH_BGQ) - declare_configuration_metadata("architecture", "CPU architecture", "BGQ"); #elif defined(KOKKOS_ARCH_HSW) declare_configuration_metadata("architecture", "CPU architecture", "HSW"); #elif defined(KOKKOS_ARCH_ICL) @@ -659,8 +641,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "KNL"); #elif defined(KOKKOS_ARCH_NATIVE) declare_configuration_metadata("architecture", "CPU architecture", "NATIVE"); -#elif defined(KOKKOS_ARCH_POWER7) - declare_configuration_metadata("architecture", "CPU architecture", "POWER7"); #elif defined(KOKKOS_ARCH_POWER8) declare_configuration_metadata("architecture", "CPU architecture", "POWER8"); #elif defined(KOKKOS_ARCH_POWER9) @@ -673,8 +653,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "SNB"); #elif defined(KOKKOS_ARCH_SPR) declare_configuration_metadata("architecture", "CPU architecture", "SPR"); -#elif defined(KOKKOS_ARCH_WSM) - declare_configuration_metadata("architecture", "CPU architecture", "WSM"); #elif defined(KOKKOS_ARCH_AMD_ZEN) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN"); #elif defined(KOKKOS_ARCH_AMD_ZEN2) @@ -683,6 +661,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_ZEN3) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN3"); +#elif defined(KOKKOS_ARCH_RISCV_SG2042) + declare_configuration_metadata("architecture", "CPU architecture", + "SG2042 (RISC-V)") #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -752,8 +733,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_ADA89) declare_configuration_metadata("architecture", "GPU architecture", "ADA89"); #elif 
defined(KOKKOS_ARCH_HOPPER90) - declare_configuration_metadata("architecture", "GPU architecture", - "HOPPER90"); + declare_configuration_metadata("architecture", "GPU architecture", + "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX906"); @@ -911,36 +892,18 @@ void Kokkos::Impl::parse_command_line_arguments( int num_threads; int device_id; - int num_devices; // deprecated - int skip_device; // deprecated std::string map_device_id_by; bool disable_warnings; bool print_configuration; bool tune_internals; - auto get_flag = [](std::string s) -> std::string { - return s.erase(s.find('=')); - }; - bool help_flag = false; int iarg = 0; while (iarg < argc) { bool remove_flag = false; - if (check_arg(argv[iarg], "--kokkos-numa") || - check_arg(argv[iarg], "--numa")) { - warn_deprecated_command_line_argument(get_flag(argv[iarg])); - // remove flag if prefixed with '--kokkos-' - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads) || - check_arg_int(argv[iarg], "--num-threads", num_threads) || - check_arg_int(argv[iarg], "--kokkos-threads", num_threads) || - check_arg_int(argv[iarg], "--threads", num_threads)) { - if (get_flag(argv[iarg]) != "--kokkos-num-threads") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-num-threads"); - } + if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads)) { if (!is_valid_num_threads(num_threads)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." 
@@ -949,15 +912,8 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_num_threads(num_threads); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id) || - check_arg_int(argv[iarg], "--device-id", device_id) || - check_arg_int(argv[iarg], "--kokkos-device", device_id) || - check_arg_int(argv[iarg], "--device", device_id)) { - if (get_flag(argv[iarg]) != "--kokkos-device-id") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-device-id"); - } + remove_flag = true; + } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id)) { if (!is_valid_device_id(device_id)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." @@ -966,70 +922,7 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_device_id(device_id); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices") || - check_arg(argv[iarg], "--ndevices")) { - if (check_arg(argv[iarg], "--num-devices")) { - warn_deprecated_command_line_argument("--num-devices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--ndevices")) { - warn_deprecated_command_line_argument("--ndevices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--kokkos-ndevices")) { - warn_deprecated_command_line_argument("--kokkos-ndevices", - "--kokkos-num-devices"); - } - warn_deprecated_command_line_argument( - "--kokkos-num-devices", "--kokkos-map-device-id-by=mpi_rank"); - // Find the number of device (expecting --device=XX) - if (!((strncmp(argv[iarg], "--kokkos-num-devices=", 21) == 0) || - (strncmp(argv[iarg], "--num-devices=", 14) == 0) || - (strncmp(argv[iarg], "--kokkos-ndevices=", 18) == 0) || - 
(strncmp(argv[iarg], "--ndevices=", 11) == 0))) - throw_runtime_exception( - "Error: expecting an '=INT[,INT]' after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - - char* num1 = strchr(argv[iarg], '=') + 1; - char* num2 = strpbrk(num1, ","); - int num1_len = num2 == nullptr ? strlen(num1) : num2 - num1; - char* num1_only = new char[num1_len + 1]; - strncpy(num1_only, num1, num1_len); - num1_only[num1_len] = '\0'; - - if (!is_unsigned_int(num1_only) || (strlen(num1_only) == 0)) { - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - } - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - num_devices = std::stoi(num1_only); - settings.set_num_devices(num_devices); - settings.set_map_device_id_by("mpi_rank"); - } - delete[] num1_only; - - if (num2 != nullptr) { - if ((!is_unsigned_int(num2 + 1)) || (strlen(num2) == 1)) - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices=XX,'." 
- " Raised by Kokkos::initialize()."); - - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - skip_device = std::stoi(num2 + 1); - settings.set_skip_device(skip_device); - } - } - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; + remove_flag = true; } else if (check_arg_bool(argv[iarg], "--kokkos-disable-warnings", disable_warnings)) { settings.set_disable_warnings(disable_warnings); @@ -1098,9 +991,6 @@ void Kokkos::Impl::parse_environment_variables( } combine(settings, tools_init_arguments); - if (std::getenv("KOKKOS_NUMA")) { - warn_deprecated_environment_variable("KOKKOS_NUMA"); - } int num_threads; if (check_env_int("KOKKOS_NUM_THREADS", num_threads)) { if (!is_valid_num_threads(num_threads)) { @@ -1125,34 +1015,6 @@ void Kokkos::Impl::parse_environment_variables( } settings.set_device_id(device_id); } - int num_devices; - int rand_devices; - bool has_num_devices = check_env_int("KOKKOS_NUM_DEVICES", num_devices); - bool has_rand_devices = check_env_int("KOKKOS_RAND_DEVICES", rand_devices); - if (has_rand_devices && has_num_devices) { - Impl::throw_runtime_exception( - "Error: cannot specify both KOKKOS_NUM_DEVICES and " - "KOKKOS_RAND_DEVICES." 
- " Raised by Kokkos::initialize()."); - } - if (has_num_devices) { - warn_deprecated_environment_variable("KOKKOS_NUM_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=mpi_rank"); - settings.set_map_device_id_by("mpi_rank"); - settings.set_num_devices(num_devices); - } - if (has_rand_devices) { - warn_deprecated_environment_variable("KOKKOS_RAND_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=random"); - settings.set_map_device_id_by("random"); - settings.set_num_devices(rand_devices); - } - if (has_num_devices || has_rand_devices) { - int skip_device; - if (check_env_int("KOKKOS_SKIP_DEVICE", skip_device)) { - settings.set_skip_device(skip_device); - } - } bool disable_warnings; if (check_env_bool("KOKKOS_DISABLE_WARNINGS", disable_warnings)) { settings.set_disable_warnings(disable_warnings); diff --git a/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp b/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp index bd89c8b19ca9..70dca5d8fade 100644 --- a/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_DeviceManagement.hpp @@ -17,17 +17,17 @@ #ifndef KOKKOS_DEVICE_MANAGEMENT_HPP #define KOKKOS_DEVICE_MANAGEMENT_HPP +#include #include namespace Kokkos { class InitializationSettings; namespace Impl { -int get_gpu(const Kokkos::InitializationSettings& settings); +std::optional get_gpu(const Kokkos::InitializationSettings& settings); // This declaration is provided for testing purposes only int get_ctest_gpu(int local_rank); -// ditto -std::vector get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count); +std::vector get_visible_devices(int device_count); // test-only +std::vector const& get_visible_devices(); // use this instead } // namespace Impl } // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.cpp b/packages/kokkos/core/src/impl/Kokkos_Error.cpp index 4babe2d72bd1..de6e83ed1f28 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Error.cpp +++ 
b/packages/kokkos/core/src/impl/Kokkos_Error.cpp @@ -21,10 +21,11 @@ #include #include -#include +#include #include #include #include +#include // show_warnings #include #include @@ -38,6 +39,12 @@ void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } +void log_warning(const std::string &msg) { + if (show_warnings()) { + std::cerr << msg << std::flush; + } +} + std::string human_memory_size(size_t arg_bytes) { double bytes = arg_bytes; const double K = 1024; @@ -64,7 +71,8 @@ std::string human_memory_size(size_t arg_bytes) { void Experimental::RawMemoryAllocationFailure::print_error_message( std::ostream &o) const { - o << "Allocation of size " << Impl::human_memory_size(m_attempted_size); + o << "Allocation of size " + << ::Kokkos::Impl::human_memory_size(m_attempted_size); o << " failed"; switch (m_failure_mode) { case FailureMode::OutOfMemoryError: diff --git a/packages/kokkos/core/src/impl/Kokkos_Error.hpp b/packages/kokkos/core/src/impl/Kokkos_Error.hpp index 3d0b1d3274c8..1058fd98dbf7 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Error.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Error.hpp @@ -28,6 +28,8 @@ namespace Impl { [[noreturn]] void throw_runtime_exception(const std::string &msg); +void log_warning(const std::string &msg); + std::string human_memory_size(size_t arg_bytes); } // namespace Impl @@ -58,7 +60,8 @@ class RawMemoryAllocationFailure : public std::bad_alloc { HIPMallocManaged, SYCLMallocDevice, SYCLMallocShared, - SYCLMallocHost + SYCLMallocHost, + OpenACCMalloc, }; private: diff --git a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp deleted file mode 100644 index cd640b88cb92..000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_HBWSpace.cpp +++ /dev/null @@ -1,313 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -#ifdef KOKKOS_ENABLE_HBWSPACE -#define MEMKIND_TYPE MEMKIND_HBW // hbw_get_kind(HBW_PAGESIZE_4KB) - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Experimental { - -/* Default allocation mechanism */ -HBWSpace::HBWSpace() : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); -} - -/* Default allocation mechanism */ -HBWSpace::HBWSpace(const HBWSpace::AllocationMechanism &arg_alloc_mech) - : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init2\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); - if (arg_alloc_mech == STD_MALLOC) { - m_alloc_mech = HBWSpace::STD_MALLOC; - } -} - -void *HBWSpace::allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); -} -void *HBWSpace::allocate(const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); -} -void *HBWSpace::impl_allocate( - const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const 
Kokkos::Tools::SpaceHandle arg_handle) const { - static_assert(sizeof(void *) == sizeof(uintptr_t), - "Error sizeof(void*) != sizeof(uintptr_t)"); - - static_assert( - Kokkos::Impl::power_of_two::value, - "Memory alignment must be power of two"); - - constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; - constexpr uintptr_t alignment_mask = alignment - 1; - - void *ptr = nullptr; - - if (arg_alloc_size) { - if (m_alloc_mech == STD_MALLOC) { - // Over-allocate to and round up to guarantee proper alignment. - size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; - - void *alloc_ptr = memkind_malloc(MEMKIND_TYPE, size_padded); - - if (alloc_ptr) { - uintptr_t address = reinterpret_cast(alloc_ptr); - - // offset enough to record the alloc_ptr - address += sizeof(void *); - uintptr_t rem = address % alignment; - uintptr_t offset = rem ? (alignment - rem) : 0u; - address += offset; - ptr = reinterpret_cast(address); - // record the alloc'd pointer - address -= sizeof(void *); - *reinterpret_cast(address) = alloc_ptr; - } - } - } - - if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) || - (reinterpret_cast(ptr) & alignment_mask)) { - std::ostringstream msg; - msg << "Kokkos::Experimental::HBWSpace::allocate[ "; - switch (m_alloc_mech) { - case STD_MALLOC: msg << "STD_MALLOC"; break; - case POSIX_MEMALIGN: msg << "POSIX_MEMALIGN"; break; - case POSIX_MMAP: msg << "POSIX_MMAP"; break; - case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC"; break; - } - msg << " ]( " << arg_alloc_size << " ) FAILED"; - if (ptr == nullptr) { - msg << " nullptr"; - } else { - msg << " NOT ALIGNED " << ptr; - } - - std::cerr << msg.str() << std::endl; - std::cerr.flush(); - - Kokkos::Impl::throw_runtime_exception(msg.str()); - } - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); - } - - return ptr; -} - -void HBWSpace::deallocate(void *const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); -} -void HBWSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); -} -void HBWSpace::impl_deallocate( - const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - if (arg_alloc_ptr) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, - reported_size); - } - - if (m_alloc_mech == STD_MALLOC) { - void *alloc_ptr = *(reinterpret_cast(arg_alloc_ptr) - 1); - memkind_free(MEMKIND_TYPE, alloc_ptr); - } - } -} - -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string 
&arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast *>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - -//---------------------------------------------------------------------------- - -void * -SharedAllocationRecord::allocate_tracked( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying 
data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = - SharedAllocationRecord; - - SharedAllocationHeader const *const head = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - RecordHost *const record = - head ? static_cast(head->m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace " - ", void >::get_record ERROR")); - } - - return record; -} - -// Iterate records to print orphaned memory ... -void SharedAllocationRecord:: - print_records(std::ostream &s, const Kokkos::Experimental::HBWSpace &space, - bool detail) { -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "HBWSpace", &s_root_record, detail); -#else - throw_runtime_exception( - "SharedAllocationRecord::print_records" - " only works with KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index 4a22898d1682..bcce013b00ec 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -196,12 +196,12 @@ KOKKOS_INLINE_FUNCTION template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&); + T x, const Kokkos::Impl::half_impl_t::type&); #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&); + T x, const Kokkos::Impl::bhalf_impl_t::type&); 
#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED template @@ -283,13 +283,6 @@ class alignas(FloatType) floating_point_wrapper { private: impl_type val; - using fixed_width_integer_type = std::conditional_t< - sizeof(impl_type) == 2, uint16_t, - std::conditional_t< - sizeof(impl_type) == 4, uint32_t, - std::conditional_t>>; - static_assert(!std::is_void::value, - "Invalid impl_type"); public: // In-class initialization and defaulted default constructors not used @@ -318,18 +311,6 @@ class alignas(FloatType) floating_point_wrapper { default; #endif - KOKKOS_INLINE_FUNCTION - floating_point_wrapper(const volatile floating_point_wrapper& rhs) { -#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL) - val = rhs.val; -#else - const volatile fixed_width_integer_type* rv_ptr = - reinterpret_cast(&rhs.val); - const fixed_width_integer_type rv_val = *rv_ptr; - val = reinterpret_cast(rv_val); -#endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - } - KOKKOS_FUNCTION floating_point_wrapper(bit_comparison_type rhs) { val = Kokkos::bit_cast(rhs); @@ -492,15 +473,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - template - KOKKOS_FUNCTION void operator=(T rhs) volatile { - impl_type new_val = cast_to_wrapper(rhs, val).val; - volatile fixed_width_integer_type* val_ptr = - reinterpret_cast( - const_cast(&val)); - *val_ptr = reinterpret_cast(new_val); - } - // Compound operators KOKKOS_FUNCTION floating_point_wrapper& operator+=(floating_point_wrapper rhs) { @@ -515,15 +487,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator+=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs += tmp_rhs; - *this = tmp_lhs; - } - // Compound operators: upcast overloads for += template KOKKOS_FUNCTION friend std::enable_if_t< @@ -560,15 +523,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - 
KOKKOS_FUNCTION - void operator-=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs -= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for -= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -605,15 +559,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator*=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs *= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for *= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -650,15 +595,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator/=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs /= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for /= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -884,27 +820,6 @@ class alignas(FloatType) floating_point_wrapper { #endif } - KOKKOS_FUNCTION - friend bool operator==(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs == tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator!=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs != tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator<(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs < tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -923,13 +838,6 @@ class alignas(FloatType) 
floating_point_wrapper { return lhs < static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs > tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -948,13 +856,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs > static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator<=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs <= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -973,13 +874,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs <= static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs >= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -1018,14 +912,14 @@ class alignas(FloatType) floating_point_wrapper { // Declare wrapper overloads now that floating_point_wrapper is declared template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&) { + T x, const Kokkos::Impl::half_impl_t::type&) { return Kokkos::Experimental::cast_to_half(x); } #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) { + T x, const Kokkos::Impl::bhalf_impl_t::type&) { return Kokkos::Experimental::cast_to_bhalf(x); } #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp index 
a9d721605937..1047b773d774 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -20,23 +20,11 @@ #include +#include +#include #include -#include #include -/*--------------------------------------------------------------------------*/ - -#if (defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)) && \ - !defined(KOKKOS_ENABLE_CUDA) - -// Intel specialized allocator does not interoperate with CUDA memory allocation - -#define KOKKOS_ENABLE_INTEL_MM_ALLOC - -#endif - -/*--------------------------------------------------------------------------*/ - #include #include #include @@ -50,10 +38,6 @@ #include #endif -#include -#include -#include - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -150,84 +134,6 @@ void HostSpace::impl_deallocate( } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationHeader *_do_allocation(Kokkos::HostSpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast( - space.allocate(alloc_size)); - } catch (Experimental::RawMemoryAllocationFailure const &failure) { - if (failure.failure_mode() == Experimental::RawMemoryAllocationFailure:: - FailureMode::AllocationNotAligned) { - // TODO: delete the misaligned memory - } - - std::cerr << "Kokkos failed to allocate memory for 
label \"" << label - << "\". Allocation using MemorySpace named \"" << space.name() - << " failed with the following error: "; - failure.print_error_message(std::cerr); - std::cerr.flush(); - Kokkos::Impl::throw_runtime_exception("Memory allocation failure"); - } - return nullptr; // unreachable -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::HostSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//============================================================================== -// {{{1 - #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
-template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos - -// end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::HostSpace); diff --git a/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp index f740c408fb8f..3072e2ce8251 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HostSpace::execution_space& exec, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const HostSpace::execution_space& exec, const View& dst) { // Host spaces, except for HPX, are synchronous and we need to fence for HPX // since we can't properly enqueue a std::memset otherwise. 
// We can't use exec.fence() directly since we don't have a full definition @@ -36,12 +35,6 @@ struct ZeroMemset> { using ValueType = typename View::value_type; std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } }; } // end namespace Impl diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp index bfe5902bf7ff..11bf701b57a2 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -22,7 +22,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp index 51f25a8b60f1..25f09b828655 100644 --- a/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -885,7 +885,7 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } - auto team_member = loop_boundaries.thread; + auto& team_member = loop_boundaries.thread; // 'accum' output is the exclusive prefix sum accum = team_member.team_scan(accum); diff --git a/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp b/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp index ab4350f3a7a4..11a93c6bb56b 100644 --- a/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp @@ -24,32 +24,6 @@ namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments { - int num_threads; - int num_numa; - int device_id; - int ndevices; - int skip_device; - bool 
disable_warnings; - bool tune_internals; - bool tool_help = false; - std::string tool_lib = {}; - std::string tool_args = {}; - - KOKKOS_DEPRECATED_WITH_COMMENT("Use InitializationSettings instead!") - InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, - bool ti = false) - : num_threads{nt}, - num_numa{nn}, - device_id{dv}, - ndevices{-1}, - skip_device{9999}, - disable_warnings{dw}, - tune_internals{ti} {} -}; -#endif - class InitializationSettings { #define KOKKOS_IMPL_DECLARE(TYPE, NAME) \ private: \ @@ -64,12 +38,32 @@ class InitializationSettings { TYPE get_##NAME() const noexcept { return *m_##NAME; } \ static_assert(true, "no-op to require trailing semicolon") +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + private: \ + std::optional m_##NAME; \ + \ + public: \ + KOKKOS_DEPRECATED InitializationSettings& set_##NAME(TYPE NAME) { \ + m_##NAME = NAME; \ + return *this; \ + } \ + KOKKOS_DEPRECATED bool has_##NAME() const noexcept { \ + return static_cast(m_##NAME); \ + } \ + KOKKOS_DEPRECATED TYPE get_##NAME() const noexcept { return *m_##NAME; } \ + static_assert(true, "no-op to require trailing semicolon") +#else +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + static_assert(true, "no-op to require trailing semicolon") +#endif + public: KOKKOS_IMPL_DECLARE(int, num_threads); KOKKOS_IMPL_DECLARE(int, device_id); KOKKOS_IMPL_DECLARE(std::string, map_device_id_by); - KOKKOS_IMPL_DECLARE(int, num_devices); // deprecated - KOKKOS_IMPL_DECLARE(int, skip_device); // deprecated + KOKKOS_IMPL_DECLARE_DEPRECATED(int, num_devices); + KOKKOS_IMPL_DECLARE_DEPRECATED(int, skip_device); KOKKOS_IMPL_DECLARE(bool, disable_warnings); KOKKOS_IMPL_DECLARE(bool, print_configuration); KOKKOS_IMPL_DECLARE(bool, tune_internals); @@ -80,41 +74,6 @@ class InitializationSettings { #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER #undef KOKKOS_IMPL_DECLARE - -#ifdef 
KOKKOS_ENABLE_DEPRECATED_CODE_3 - public: - InitializationSettings() = default; - - InitializationSettings(InitArguments const& old) { - if (old.num_threads != -1) { - set_num_threads(old.num_threads); - } - if (old.device_id != -1) { - set_device_id(old.device_id); - } - if (old.ndevices != -1) { - set_num_devices(old.ndevices); - } - if (old.skip_device != 9999) { - set_skip_device(old.skip_device); - } - if (old.disable_warnings) { - set_disable_warnings(true); - } - if (old.tune_internals) { - set_tune_internals(true); - } - if (old.tool_help) { - set_tools_help(true); - } - if (!old.tool_lib.empty()) { - set_tools_libs(old.tool_lib); - } - if (!old.tool_args.empty()) { - set_tools_args(old.tool_args); - } - } -#endif }; } // namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp deleted file mode 100644 index 2f0e01c5b28d..000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.cpp +++ /dev/null @@ -1,72 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.cpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. 
- */ - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -void safe_throw_allocation_with_header_failure( - std::string const& space_name, std::string const& label, - Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - auto generate_failure_message = [&](std::ostream& o) { - o << "Kokkos failed to allocate memory for label \"" << label - << "\". Allocation using MemorySpace named \"" << space_name - << "\" failed with the following error: "; - failure.print_error_message(o); - if (failure.failure_mode() == - Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: - AllocationNotAligned) { - // TODO: delete the misaligned memory? - o << "Warning: Allocation failed due to misalignment; memory may " - "be leaked.\n"; - } - o.flush(); - }; - try { - std::ostringstream sstr; - generate_failure_message(sstr); - Kokkos::Impl::throw_runtime_exception(sstr.str()); - } catch (std::bad_alloc const&) { - // Probably failed to allocate the string because we're so close to out - // of memory. Try printing to std::cerr instead - try { - generate_failure_message(std::cerr); - } catch (std::bad_alloc const&) { - // oh well, we tried... - } - Kokkos::Impl::throw_runtime_exception( - "Kokkos encountered an allocation failure, then another allocation " - "failure while trying to create the error message."); - } -} - -} // end namespace Impl -} // end namespace Kokkos diff --git a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp b/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp deleted file mode 100644 index 44956dd7c5d9..000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_MemorySpace.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.hpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. - */ - -#ifndef KOKKOS_IMPL_MEMORYSPACE_HPP -#define KOKKOS_IMPL_MEMORYSPACE_HPP - -#include -#include -#include - -#include - -namespace Kokkos { -namespace Impl { - -// Defined in implementation file to avoid having to include iostream -void safe_throw_allocation_with_header_failure( - std::string const &space_name, std::string const &label, - Kokkos::Experimental::RawMemoryAllocationFailure const &failure); - -template -SharedAllocationHeader *checked_allocation_with_header(MemorySpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -template -SharedAllocationHeader *checked_allocation_with_header( - ExecutionSpace const &exec_space, MemorySpace const &space, - std::string const &label, size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_MEMORYSPACE_HPP diff --git 
a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp deleted file mode 100644 index 42a53b04fb2a..000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp +++ /dev/null @@ -1,54 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_MEMORY_FENCE_HPP) -#define KOKKOS_MEMORY_FENCE_HPP -namespace Kokkos { - -////////////////////////////////////////////////////// -// store_fence() -// -// If possible use a store fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void store_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("sfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -////////////////////////////////////////////////////// -// load_fence() -// -// If possible use a load fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void load_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("lfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -} // namespace Kokkos - -#endif diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h index 731a11e917ad..15c466b27ed4 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h +++ 
b/packages/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h @@ -154,7 +154,7 @@ enum Kokkos_Tools_OptimizationType { Kokkos_Tools_Maximize }; -struct Kokkos_Tools_OptimzationGoal { +struct Kokkos_Tools_OptimizationGoal { size_t type_id; enum Kokkos_Tools_OptimizationType goal; }; @@ -220,7 +220,7 @@ typedef void (*Kokkos_Tools_contextBeginFunction)(const size_t); typedef void (*Kokkos_Tools_contextEndFunction)( const size_t, struct Kokkos_Tools_VariableValue); typedef void (*Kokkos_Tools_optimizationGoalDeclarationFunction)( - const size_t, const struct Kokkos_Tools_OptimzationGoal goal); + const size_t, const struct Kokkos_Tools_OptimizationGoal goal); struct Kokkos_Profiling_EventSet { Kokkos_Profiling_initFunction init; diff --git a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp index af71932e47be..b66886d9f7e4 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -226,7 +226,7 @@ using ValueType = Kokkos_Tools_VariableInfo_ValueType; using CandidateValueType = Kokkos_Tools_VariableInfo_CandidateValueType; using SetOrRange = Kokkos_Tools_VariableInfo_SetOrRange; using VariableInfo = Kokkos_Tools_VariableInfo; -using OptimizationGoal = Kokkos_Tools_OptimzationGoal; +using OptimizationGoal = Kokkos_Tools_OptimizationGoal; using TuningString = Kokkos_Tools_Tuning_String; using VariableValue = Kokkos_Tools_VariableValue; diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp index 255f5125f4ab..0bc3814b3a1b 100644 --- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp +++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp @@ -20,6 +20,8 @@ #include #include +#include +#include namespace Kokkos { namespace Impl { @@ -321,5 +323,53 @@ void SharedAllocationRecord::print_host_accessible_records( } #endif +void 
safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + auto generate_failure_message = [&](std::ostream& o) { + o << "Kokkos failed to allocate memory for label \"" << label + << "\". Allocation using MemorySpace named \"" << space_name + << "\" failed with the following error: "; + failure.print_error_message(o); + if (failure.failure_mode() == + Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: + AllocationNotAligned) { + // TODO: delete the misaligned memory? + o << "Warning: Allocation failed due to misalignment; memory may " + "be leaked.\n"; + } + o.flush(); + }; + try { + std::ostringstream sstr; + generate_failure_message(sstr); + Kokkos::Impl::throw_runtime_exception(sstr.str()); + } catch (std::bad_alloc const&) { + // Probably failed to allocate the string because we're so close to out + // of memory. Try printing to std::cerr instead + try { + generate_failure_message(std::cerr); + } catch (std::bad_alloc const&) { + // oh well, we tried... 
+ } + Kokkos::Impl::throw_runtime_exception( + "Kokkos encountered an allocation failure, then another allocation " + "failure while trying to create the error message."); + } +} + +void fill_host_accessible_header_info( + SharedAllocationRecord* arg_record, + SharedAllocationHeader& arg_header, std::string const& arg_label) { + // Fill in the Header information, directly accessible on the host + + arg_header.m_record = arg_record; + + strncpy(arg_header.m_label, arg_label.c_str(), + SharedAllocationHeader::maximum_label_length); + // Set last element zero, in case c_str is too long + arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; +} + } /* namespace Impl */ } /* namespace Kokkos */ diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp index 043505a158e9..99ab660213f7 100644 --- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp @@ -51,6 +51,9 @@ class SharedAllocationHeader { friend class SharedAllocationRecordCommon; template friend class HostInaccessibleSharedAllocationRecordCommon; + friend void fill_host_accessible_header_info( + SharedAllocationRecord*, SharedAllocationHeader&, + std::string const&); Record* m_record; char m_label[maximum_label_length]; @@ -145,25 +148,23 @@ class SharedAllocationRecord { SharedAllocationRecord() : m_alloc_ptr(nullptr), m_alloc_size(0), - m_dealloc(nullptr) + m_dealloc(nullptr), #ifdef KOKKOS_ENABLE_DEBUG - , m_root(this), m_prev(this), - m_next(this) + m_next(this), #endif - , m_count(0) { } static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length; - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION const SharedAllocationHeader* head() const { return m_alloc_ptr; } /* User's memory begins at the end of the header */ - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION void* data() const { return static_cast(m_alloc_ptr + 1); } /* User's memory begins 
at the end of the header */ @@ -195,23 +196,79 @@ class SharedAllocationRecord { const SharedAllocationRecord* const root, const bool detail); }; +void safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + Kokkos::Experimental::RawMemoryAllocationFailure const& failure); + +template +SharedAllocationHeader* checked_allocation_with_header(MemorySpace const& space, + std::string const& label, + size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + label.c_str(), alloc_size + sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +template +SharedAllocationHeader* checked_allocation_with_header( + ExecutionSpace const& exec_space, MemorySpace const& space, + std::string const& label, size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +void fill_host_accessible_header_info(SharedAllocationHeader& arg_header, + std::string const& arg_label); + template class SharedAllocationRecordCommon : public SharedAllocationRecord { private: using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; - derived_t& self() { return *static_cast(this); } - derived_t const& self() const { return *static_cast(this); } protected: using record_base_t::record_base_t; - void _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label); + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif static void deallocate(record_base_t* 
arg_rec); public: + ~SharedAllocationRecordCommon(); + template + SharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = *SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); + } + SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + static auto allocate(MemorySpace const& arg_space, std::string const& arg_label, size_t arg_alloc_size) -> derived_t*; @@ -231,22 +288,103 @@ class SharedAllocationRecordCommon : public SharedAllocationRecord { template class HostInaccessibleSharedAllocationRecordCommon - : public SharedAllocationRecordCommon { + : public SharedAllocationRecord { private: - using base_t = SharedAllocationRecordCommon; using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; protected: - using base_t::base_t; + using record_base_t::record_base_t; + + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif + + static void deallocate(record_base_t* arg_rec); public: + ~HostInaccessibleSharedAllocationRecordCommon(); + template + HostInaccessibleSharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + 
SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + } + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + + static auto allocate(MemorySpace const& arg_space, + std::string const& arg_label, size_t arg_alloc_size) + -> derived_t*; + /**\brief Allocate tracked memory in the space */ + static void* allocate_tracked(MemorySpace const& arg_space, + std::string const& arg_alloc_label, + size_t arg_alloc_size); + /**\brief Reallocate tracked memory in the space */ + static void deallocate_tracked(void* arg_alloc_ptr); + /**\brief Deallocate tracked memory in the space */ + static void* reallocate_tracked(void* arg_alloc_ptr, size_t arg_alloc_size); + static void print_records(std::ostream& s, MemorySpace const&, bool detail = false); static auto get_record(void* alloc_ptr) -> derived_t*; std::string get_label() const; }; +#ifdef KOKKOS_ENABLE_DEBUG +template +SharedAllocationRecord + SharedAllocationRecordCommon::s_root_record; + +template +SharedAllocationRecord + HostInaccessibleSharedAllocationRecordCommon::s_root_record; +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::SharedAllocationRecordCommon { \ + using SharedAllocationRecordCommon< \ + MEMORY_SPACE>::SharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( \ + MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> { \ + using HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE>::HostInaccessibleSharedAllocationRecordCommon; \ + } + +#define 
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::SharedAllocationRecordCommon + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> + namespace { /* Taking the address of this function so make sure it is unique */ diff --git a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp index d403ef9db064..41036ab06788 100644 --- a/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_SharedAlloc_timpl.hpp @@ -31,6 +31,66 @@ namespace Kokkos { namespace Impl { +template +SharedAllocationRecordCommon::~SharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} +template +HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::~HostInaccessibleSharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} + +template +SharedAllocationRecordCommon::SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = 
*SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); +} + +template +HostInaccessibleSharedAllocationRecordCommon:: + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, + std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + typename MemorySpace::execution_space exec; + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + exec.fence(std::string("SharedAllocationRecord::SharedAllocationRecord(): " + "fence after copying header from HostSpace"); +} + template auto SharedAllocationRecordCommon::allocate( MemorySpace const& arg_space, std::string const& arg_label, @@ -76,9 +136,64 @@ void* SharedAllocationRecordCommon::reallocate_tracked( Kokkos::Impl::DeepCopy( r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); + + record_base_t::increment(r_new); + record_base_t::decrement(r_old); + + return r_new->data(); +} + +template +auto HostInaccessibleSharedAllocationRecordCommon::allocate( + MemorySpace const& arg_space, std::string const& arg_label, + size_t arg_alloc_size) -> derived_t* { + return new derived_t(arg_space, arg_label, arg_alloc_size); +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::allocate_tracked(const MemorySpace& arg_space, + const std::string& arg_alloc_label, + size_t arg_alloc_size) 
{ + if (!arg_alloc_size) return nullptr; + + SharedAllocationRecord* const r = + allocate(arg_space, arg_alloc_label, arg_alloc_size); + + record_base_t::increment(r); + + return r->data(); +} + +template +void HostInaccessibleSharedAllocationRecordCommon::deallocate( + HostInaccessibleSharedAllocationRecordCommon::record_base_t* arg_rec) { + delete static_cast(arg_rec); +} + +template +void HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::deallocate_tracked(void* arg_alloc_ptr) { + if (arg_alloc_ptr != nullptr) { + SharedAllocationRecord* const r = derived_t::get_record(arg_alloc_ptr); + record_base_t::decrement(r); + } +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::reallocate_tracked(void* arg_alloc_ptr, + size_t arg_alloc_size) { + derived_t* const r_old = derived_t::get_record(arg_alloc_ptr); + derived_t* const r_new = + allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); + + Kokkos::Impl::DeepCopy( + r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); record_base_t::increment(r_new); record_base_t::decrement(r_old); @@ -108,20 +223,6 @@ std::string SharedAllocationRecordCommon::get_label() const { return record_base_t::m_label; } -template -void SharedAllocationRecordCommon:: - _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label) { - // Fill in the Header information, directly accessible on the host - - arg_header.m_record = &self(); - - strncpy(arg_header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - template void SharedAllocationRecordCommon::print_records( std::ostream& s, const MemorySpace&, bool detail) { diff --git 
a/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp b/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp deleted file mode 100644 index c57b17d646a2..000000000000 --- a/packages/kokkos/core/src/impl/Kokkos_Spinwait.hpp +++ /dev/null @@ -1,109 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_SPINWAIT_HPP -#define KOKKOS_SPINWAIT_HPP - -#include -#include - -#include - -#include - -namespace Kokkos { -namespace Impl { - -enum class WaitMode : int { - ACTIVE // Used for tight loops to keep threads active longest - , - PASSIVE // Used to quickly yield the thread to quite down the system - , - ROOT // Never sleep or yield the root thread -}; - -void host_thread_yield(const uint32_t i, const WaitMode mode); - -template -std::enable_if_t::value, void> root_spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> root_spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - 
-template -std::enable_if_t::value, void> yield_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> yield_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ diff --git a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp index 7e2f130564fe..cadeed1a6d84 100644 --- a/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_Utilities.hpp @@ -49,6 +49,11 @@ struct integral_constant { template struct always_true : std::true_type {}; +// type-dependent expression that is always false intended for use in +// static_assert to check "we should never get there" +template +struct always_false : std::false_type {}; + //============================================================================== #if defined(__cpp_lib_type_identity) diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp index 725ba5de092a..fe43b630184f 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ViewArray.hpp @@ -27,10 +27,9 @@ struct ViewDataAnalysis> { private: using array_analysis = ViewArrayAnalysis; - static_assert(std::is_void

::value, ""); + static_assert(std::is_void

::value); static_assert(std::is_same>::value, - ""); + Kokkos::Array>::value); static_assert(std::is_scalar::value, "View of Array type must be of a scalar type"); @@ -130,6 +129,12 @@ class ViewMapping> { return m_impl_offset.m_dim.extent(r); } + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + using dim_type = typename offset_type::dimension_type; + return dim_type::static_extent(r); + } + KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() const { return m_impl_offset.layout(); @@ -507,7 +512,7 @@ class ViewMapping< Kokkos::LayoutStride>::value))>, SrcTraits, Args...> { private: - static_assert(SrcTraits::rank == sizeof...(Args), ""); + static_assert(SrcTraits::rank == sizeof...(Args)); enum : bool { R0 = is_integral_extent<0, Args...>::value, diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp new file mode 100644 index 000000000000..04c0c9aeede7 --- /dev/null +++ b/packages/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp @@ -0,0 +1,402 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_VIEW_DATA_ANALYSIS_HPP +#define KOKKOS_VIEW_DATA_ANALYSIS_HPP + +#include + +namespace Kokkos::Impl { + +template +struct variadic_size_t { + enum : size_t { value = KOKKOS_INVALID_INDEX }; +}; + +template +struct variadic_size_t<0, Val, Args...> { + enum : size_t { value = Val }; +}; + +template +struct variadic_size_t { + enum : size_t { value = variadic_size_t::value }; +}; + +template +struct rank_dynamic; + +template <> +struct rank_dynamic<> { + enum : unsigned { value = 0 }; +}; + +template +struct rank_dynamic { + enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic::value }; +}; + +#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ + template \ + struct ViewDimension##R { \ + static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? 
V : 1); \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + }; \ + template \ + constexpr size_t ViewDimension##R::ArgN##R; \ + template \ + constexpr size_t ViewDimension##R::N##R; \ + template \ + struct ViewDimension##R<0u, RD> { \ + static constexpr size_t ArgN##R = 0; \ + std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ + }; \ + template \ + constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; + +KOKKOS_IMPL_VIEW_DIMENSION(0) +KOKKOS_IMPL_VIEW_DIMENSION(1) +KOKKOS_IMPL_VIEW_DIMENSION(2) +KOKKOS_IMPL_VIEW_DIMENSION(3) +KOKKOS_IMPL_VIEW_DIMENSION(4) +KOKKOS_IMPL_VIEW_DIMENSION(5) +KOKKOS_IMPL_VIEW_DIMENSION(6) +KOKKOS_IMPL_VIEW_DIMENSION(7) + +#undef KOKKOS_IMPL_VIEW_DIMENSION + +// MSVC does not do empty base class optimization by default. 
+// Per standard it is required for standard layout types +template +struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension + : public ViewDimension0::value, + rank_dynamic::value>, + public ViewDimension1::value, + rank_dynamic::value>, + public ViewDimension2::value, + rank_dynamic::value>, + public ViewDimension3::value, + rank_dynamic::value>, + public ViewDimension4::value, + rank_dynamic::value>, + public ViewDimension5::value, + rank_dynamic::value>, + public ViewDimension6::value, + rank_dynamic::value>, + public ViewDimension7::value, + rank_dynamic::value> { + using D0 = ViewDimension0::value, + rank_dynamic::value>; + using D1 = ViewDimension1::value, + rank_dynamic::value>; + using D2 = ViewDimension2::value, + rank_dynamic::value>; + using D3 = ViewDimension3::value, + rank_dynamic::value>; + using D4 = ViewDimension4::value, + rank_dynamic::value>; + using D5 = ViewDimension5::value, + rank_dynamic::value>; + using D6 = ViewDimension6::value, + rank_dynamic::value>; + using D7 = ViewDimension7::value, + rank_dynamic::value>; + + using D0::ArgN0; + using D1::ArgN1; + using D2::ArgN2; + using D3::ArgN3; + using D4::ArgN4; + using D5::ArgN5; + using D6::ArgN6; + using D7::ArgN7; + + using D0::N0; + using D1::N1; + using D2::N2; + using D3::N3; + using D4::N4; + using D5::N5; + using D6::N6; + using D7::N7; + + static constexpr unsigned rank = sizeof...(Vals); + static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; + + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; + ViewDimension& operator=(const ViewDimension&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, + size_t n5, size_t n6, size_t n7) + : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), + D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), + D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), + D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), + D4(n4 == KOKKOS_INVALID_INDEX ? 
1 : n4), + D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), + D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), + D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} + + KOKKOS_INLINE_FUNCTION + constexpr size_t extent(const unsigned r) const noexcept { + return r == 0 + ? N0 + : (r == 1 + ? N1 + : (r == 2 + ? N2 + : (r == 3 + ? N3 + : (r == 4 + ? N4 + : (r == 5 + ? N5 + : (r == 6 + ? N6 + : (r == 7 ? N7 + : 0))))))); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return r == 0 + ? ArgN0 + : (r == 1 + ? ArgN1 + : (r == 2 + ? ArgN2 + : (r == 3 + ? ArgN3 + : (r == 4 + ? ArgN4 + : (r == 5 + ? ArgN5 + : (r == 6 + ? ArgN6 + : (r == 7 ? ArgN7 + : 0))))))); + } + + template + struct prepend { + using type = ViewDimension; + }; + + template + struct append { + using type = ViewDimension; + }; +}; + +template +struct ViewDimensionJoin; + +template +struct ViewDimensionJoin, ViewDimension> { + using type = ViewDimension; +}; + +//---------------------------------------------------------------------------- + +template +struct ViewDimensionAssignable; + +template +struct ViewDimensionAssignable, + ViewDimension> { + using dst = ViewDimension; + using src = ViewDimension; + + enum { + value = unsigned(dst::rank) == unsigned(src::rank) && + ( + // Compile time check that potential static dimensions match + ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) + ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) + : true) && + ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) + ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) + : true) && + ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) + ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) + : true) && + ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) + ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) + : true) && + ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) + ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) + : true) && + ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) + ? 
(size_t(dst::ArgN5) == size_t(src::ArgN5)) + : true) && + ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) + ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) + : true) && + ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) + ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) + : true)) + }; +}; + +/** \brief Given a value type and dimension generate the View data type */ +template +struct ViewDataType; + +template +struct ViewDataType> { + using type = T; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type[N]; +}; + +/**\brief Analysis of View data type. + * + * Data type conforms to one of the following patterns : + * {const} value_type [][#][#][#] + * {const} value_type ***[#][#][#] + * Where the sum of counts of '*' and '[#]' is at most ten. + * + * Provide alias for ViewDimension<...> and value_type. + */ +template +struct ViewArrayAnalysis { + using value_type = T; + using const_value_type = std::add_const_t; + using non_const_value_type = std::remove_const_t; + using static_dimension = ViewDimension<>; + using dynamic_dimension = ViewDimension<>; + using dimension = ViewDimension<>; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using static_dimension = + typename nested::static_dimension::template prepend::type; + + using dynamic_dimension = typename nested::dynamic_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + using nested_dimension = typename nested::dimension; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using 
non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewDataAnalysis { + private: + using array_analysis = ViewArrayAnalysis; + + // ValueType is opportunity for partial specialization. + // Must match array analysis when this default template is used. + static_assert( + std::is_same::value); + + public: + using specialize = void; // No specialization + + using dimension = typename array_analysis::dimension; + using value_type = typename array_analysis::value_type; + using const_value_type = typename array_analysis::const_value_type; + using non_const_value_type = typename array_analysis::non_const_value_type; + + // Generate analogous multidimensional array specification type. + using type = typename ViewDataType::type; + using const_type = typename ViewDataType::type; + using non_const_type = + typename ViewDataType::type; + + // Generate "flattened" multidimensional array specification type. 
+ using scalar_array_type = type; + using const_scalar_array_type = const_type; + using non_const_scalar_array_type = non_const_type; +}; + +template +struct ViewOffset { + using is_mapping_plugin = std::false_type; +}; +} // namespace Kokkos::Impl + +#endif // KOKKOS_VIEW_DATA_ANALYSIS_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index 01d0dc4f6811..3217c76e3801 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -33,255 +33,7 @@ #include #include #include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct variadic_size_t { - enum : size_t { value = KOKKOS_INVALID_INDEX }; -}; - -template -struct variadic_size_t<0, Val, Args...> { - enum : size_t { value = Val }; -}; - -template -struct variadic_size_t { - enum : size_t { value = variadic_size_t::value }; -}; - -template -struct rank_dynamic; - -template <> -struct rank_dynamic<> { - enum : unsigned { value = 0 }; -}; - -template -struct rank_dynamic { - enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic::value }; -}; - -#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ - template \ - struct ViewDimension##R { \ - static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ - static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? 
V : 1); \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - }; \ - template \ - constexpr size_t ViewDimension##R::ArgN##R; \ - template \ - constexpr size_t ViewDimension##R::N##R; \ - template \ - struct ViewDimension##R<0u, RD> { \ - static constexpr size_t ArgN##R = 0; \ - std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ - }; \ - template \ - constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; - -KOKKOS_IMPL_VIEW_DIMENSION(0) -KOKKOS_IMPL_VIEW_DIMENSION(1) -KOKKOS_IMPL_VIEW_DIMENSION(2) -KOKKOS_IMPL_VIEW_DIMENSION(3) -KOKKOS_IMPL_VIEW_DIMENSION(4) -KOKKOS_IMPL_VIEW_DIMENSION(5) -KOKKOS_IMPL_VIEW_DIMENSION(6) -KOKKOS_IMPL_VIEW_DIMENSION(7) - -#undef KOKKOS_IMPL_VIEW_DIMENSION - -// MSVC does not do empty base class optimization by default. 
-// Per standard it is required for standard layout types -template -struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension - : public ViewDimension0::value, - rank_dynamic::value>, - public ViewDimension1::value, - rank_dynamic::value>, - public ViewDimension2::value, - rank_dynamic::value>, - public ViewDimension3::value, - rank_dynamic::value>, - public ViewDimension4::value, - rank_dynamic::value>, - public ViewDimension5::value, - rank_dynamic::value>, - public ViewDimension6::value, - rank_dynamic::value>, - public ViewDimension7::value, - rank_dynamic::value> { - using D0 = ViewDimension0::value, - rank_dynamic::value>; - using D1 = ViewDimension1::value, - rank_dynamic::value>; - using D2 = ViewDimension2::value, - rank_dynamic::value>; - using D3 = ViewDimension3::value, - rank_dynamic::value>; - using D4 = ViewDimension4::value, - rank_dynamic::value>; - using D5 = ViewDimension5::value, - rank_dynamic::value>; - using D6 = ViewDimension6::value, - rank_dynamic::value>; - using D7 = ViewDimension7::value, - rank_dynamic::value>; - - using D0::ArgN0; - using D1::ArgN1; - using D2::ArgN2; - using D3::ArgN3; - using D4::ArgN4; - using D5::ArgN5; - using D6::ArgN6; - using D7::ArgN7; - - using D0::N0; - using D1::N1; - using D2::N2; - using D3::N3; - using D4::N4; - using D5::N5; - using D6::N6; - using D7::N7; - - static constexpr unsigned rank = sizeof...(Vals); - static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; - ViewDimension& operator=(const ViewDimension&) = default; - - KOKKOS_INLINE_FUNCTION - constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, - size_t n5, size_t n6, size_t n7) - : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), - D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), - D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), - D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), - D4(n4 == KOKKOS_INVALID_INDEX ? 
1 : n4), - D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), - D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), - D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} - - KOKKOS_INLINE_FUNCTION - constexpr size_t extent(const unsigned r) const noexcept { - return r == 0 - ? N0 - : (r == 1 - ? N1 - : (r == 2 - ? N2 - : (r == 3 - ? N3 - : (r == 4 - ? N4 - : (r == 5 - ? N5 - : (r == 6 - ? N6 - : (r == 7 ? N7 - : 0))))))); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return r == 0 - ? ArgN0 - : (r == 1 - ? ArgN1 - : (r == 2 - ? ArgN2 - : (r == 3 - ? ArgN3 - : (r == 4 - ? ArgN4 - : (r == 5 - ? ArgN5 - : (r == 6 - ? ArgN6 - : (r == 7 ? ArgN7 - : 0))))))); - } - - template - struct prepend { - using type = ViewDimension; - }; - - template - struct append { - using type = ViewDimension; - }; -}; - -template -struct ViewDimensionJoin; - -template -struct ViewDimensionJoin, ViewDimension> { - using type = ViewDimension; -}; - -//---------------------------------------------------------------------------- - -template -struct ViewDimensionAssignable; - -template -struct ViewDimensionAssignable, - ViewDimension> { - using dst = ViewDimension; - using src = ViewDimension; - - enum { - value = unsigned(dst::rank) == unsigned(src::rank) && - ( - // Compile time check that potential static dimensions match - ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) - ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) - : true) && - ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) - ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) - : true) && - ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) - ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) - : true) && - ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) - ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) - : true) && - ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) - ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) - : true) && - ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) - ? 
(size_t(dst::ArgN5) == size_t(src::ArgN5)) - : true) && - ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) - ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) - : true) && - ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) - ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) - : true)) - }; -}; - -} // namespace Impl -} // namespace Kokkos +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -657,21 +409,20 @@ struct SubviewExtents { template KOKKOS_INLINE_FUNCTION SubviewExtents(const ViewDimension& dim, Args... args) { - static_assert(DomainRank == sizeof...(DimArgs), ""); - static_assert(DomainRank == sizeof...(Args), ""); + static_assert(DomainRank == sizeof...(DimArgs)); + static_assert(DomainRank == sizeof...(Args)); // Verifies that all arguments, up to 8, are integral types, // integral extents, or don't exist. - static_assert( - RangeRank == unsigned(is_integral_extent<0, Args...>::value) + - unsigned(is_integral_extent<1, Args...>::value) + - unsigned(is_integral_extent<2, Args...>::value) + - unsigned(is_integral_extent<3, Args...>::value) + - unsigned(is_integral_extent<4, Args...>::value) + - unsigned(is_integral_extent<5, Args...>::value) + - unsigned(is_integral_extent<6, Args...>::value) + - unsigned(is_integral_extent<7, Args...>::value), - ""); + static_assert(RangeRank == + unsigned(is_integral_extent<0, Args...>::value) + + unsigned(is_integral_extent<1, Args...>::value) + + unsigned(is_integral_extent<2, Args...>::value) + + unsigned(is_integral_extent<3, Args...>::value) + + unsigned(is_integral_extent<4, Args...>::value) + + unsigned(is_integral_extent<5, Args...>::value) + + unsigned(is_integral_extent<6, Args...>::value) + + unsigned(is_integral_extent<7, Args...>::value)); if (RangeRank == 0) { m_length[0] = 0; @@ -708,149 +459,6 @@ struct SubviewExtents { namespace Kokkos { namespace Impl { - -/** \brief Given a value type and 
dimension generate the View data type */ -template -struct ViewDataType; - -template -struct ViewDataType> { - using type = T; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type[N]; -}; - -/**\brief Analysis of View data type. - * - * Data type conforms to one of the following patterns : - * {const} value_type [][#][#][#] - * {const} value_type ***[#][#][#] - * Where the sum of counts of '*' and '[#]' is at most ten. - * - * Provide alias for ViewDimension<...> and value_type. - */ -template -struct ViewArrayAnalysis { - using value_type = T; - using const_value_type = std::add_const_t; - using non_const_value_type = std::remove_const_t; - using static_dimension = ViewDimension<>; - using dynamic_dimension = ViewDimension<>; - using dimension = ViewDimension<>; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using static_dimension = - typename nested::static_dimension::template prepend::type; - - using dynamic_dimension = typename nested::dynamic_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - using nested_dimension = typename nested::dimension; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - 
private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewDataAnalysis { - private: - using array_analysis = ViewArrayAnalysis; - - // ValueType is opportunity for partial specialization. - // Must match array analysis when this default template is used. - static_assert( - std::is_same::value, - ""); - - public: - using specialize = void; // No specialization - - using dimension = typename array_analysis::dimension; - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - // Generate analogous multidimensional array specification type. - using type = typename ViewDataType::type; - using const_type = typename ViewDataType::type; - using non_const_type = - typename ViewDataType::type; - - // Generate "flattened" multidimensional array specification type. 
- using scalar_array_type = type; - using const_scalar_array_type = const_type; - using non_const_scalar_array_type = non_const_type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewOffset { - using is_mapping_plugin = std::false_type; -}; - //---------------------------------------------------------------------------- // LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding template @@ -2919,13 +2527,9 @@ struct ViewValueFunctor { "Kokkos::View::initialization [" + name + "] via memset", Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( - space, - Kokkos::View>(ptr, n), - value); + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); @@ -2949,37 +2553,33 @@ struct ViewValueFunctor { template void parallel_for_implementation() { - if (!space.in_parallel()) { - using PolicyType = - Kokkos::RangePolicy, Tag>; - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - const std::string functor_name = - (std::is_same_v - ? "Kokkos::View::destruction [" + name + "]" - : "Kokkos::View::initialization [" + name + "]"); - Kokkos::Profiling::beginParallelFor( - functor_name, Kokkos::Profiling::Experimental::device_id(space), - &kpID); - } + using PolicyType = + Kokkos::RangePolicy, Tag>; + PolicyType policy(space, 0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + const std::string functor_name = + (std::is_same_v + ? 
"Kokkos::View::destruction [" + name + "]" + : "Kokkos::View::initialization [" + name + "]"); + Kokkos::Profiling::beginParallelFor( + functor_name, Kokkos::Profiling::Experimental::device_id(space), + &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(Tag{}, i); + const Kokkos::Impl::ParallelFor closure( + *this, policy); + closure.execute(); + if (default_exec_space || std::is_same_v) + space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } } @@ -3057,13 +2657,9 @@ struct ViewValueFunctor { Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( - space, - Kokkos::View>(ptr, n), - value); + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); @@ -3086,32 +2682,28 @@ struct ViewValueFunctor { } void parallel_for_implementation() { - if (!space.in_parallel()) { - PolicyType policy(0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } + PolicyType policy(0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor( + 
"Kokkos::View::initialization [" + name + "]", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, PolicyType(0, n)); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(i); + const Kokkos::Impl::ParallelFor closure( + *this, PolicyType(0, n)); + closure.execute(); + if (default_exec_space) + space.fence( + "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " + "view"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } } @@ -3896,7 +3488,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType&) { - return true; +template +KOKKOS_FUNCTION bool within_range(Map const& map, + std::index_sequence, + Indices... indices) { + return (((std::size_t)indices < map.extent(Enumerate)) && ...); } -template -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType& map, - const iType& i, - Args... args) { - return (size_t(i) < map.extent(R)) && - view_verify_operator_bounds(map, args...); +template +KOKKOS_FUNCTION constexpr char* append_formatted_multidimensional_index( + char* dest, Indices... 
indices) { + char* d = dest; + strcat(d, "["); + ( + [&] { + d += strlen(d); + to_chars_i(d, + d + 20, // 20 digits ought to be enough + indices); + strcat(d, ","); + }(), + ...); + d[strlen(d) - 1] = ']'; // overwrite trailing comma + return dest; } -template -inline void view_error_operator_bounds(char*, int, const MapType&) {} - -template -inline void view_error_operator_bounds(char* buf, int len, const MapType& map, - const iType& i, Args... args) { - const int n = snprintf( - buf, len, " %ld < %ld %c", static_cast(i), - static_cast(map.extent(R)), (sizeof...(Args) ? ',' : ')')); - view_error_operator_bounds(buf + n, len - n, map, args...); +template +KOKKOS_FUNCTION void print_extents(char* dest, Map const& map, + std::index_sequence) { + append_formatted_multidimensional_index(dest, map.extent(Enumerate)...); } -/* Check #3: is the View managed as determined by the MemoryTraits? */ -template -struct OperatorBoundsErrorOnDevice; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const&) { Kokkos::abort("View bounds error"); } -}; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const& map) { - SharedAllocationHeader const* const header = - SharedAllocationHeader::get_header( - static_cast(map.data())); - char const* const label = header->label(); - enum { LEN = 128 }; - char msg[LEN]; - char const* const first_part = "View bounds error of view "; - char* p = msg; - char* const end = msg + LEN - 1; - for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - *p = '\0'; - Kokkos::abort(msg); - } -}; - -/* Check #2: does the ViewMapping have the printable_label_typedef defined? - See above that only the non-specialized standard-layout ViewMapping has - this defined by default. 
- The existence of this alias indicates the existence of MapType::is_managed - */ template using printable_label_typedef_t = typename T::printable_label_typedef; -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const&) { - Kokkos::abort("View bounds error"); -} - -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const& map) { - OperatorBoundsErrorOnDevice::run(map); -} - template KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds( Kokkos::Impl::ViewTracker const& tracker, const MapType& map, Args... args) { - if (!view_verify_operator_bounds<0>(map, args...)) { + if (!within_range(map, std::make_index_sequence(), + args...)) { + char err[256] = ""; + strcat(err, "Kokkos::View ERROR: out of bounds access"); + strcat(err, " label=(\""); KOKKOS_IF_ON_HOST( - (enum {LEN = 1024}; char buffer[LEN]; - const std::string label = - tracker.m_tracker.template get_label(); - int n = snprintf(buffer, LEN, "View bounds error of view %s (", - label.c_str()); - view_error_operator_bounds<0>(buffer + n, LEN - n, map, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if its not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. */ if (tracker.m_tracker.has_record()) { - operator_bounds_error_on_device(map); - } else { Kokkos::abort("View bounds error"); })) + strncat(err, tracker.m_tracker.template get_label().c_str(), + 128); + } else { strcat(err, "**UNMANAGED**"); }) + KOKKOS_IF_ON_DEVICE([&] { + // Check #1: is there a SharedAllocationRecord? (we won't use it, but + // if its not there then there isn't a corresponding + // SharedAllocationHeader containing a label). 
This check should cover + // the case of Views that don't have the Unmanaged trait but were + // initialized by pointer. + if (!tracker.m_tracker.has_record()) { + strcat(err, "**UNMANAGED**"); + return; + } + // Check #2: does the ViewMapping have the printable_label_typedef + // defined? See above that only the non-specialized standard-layout + // ViewMapping has this defined by default. The existence of this + // alias indicates the existence of MapType::is_managed + if constexpr (is_detected_v) { + // Check #3: is the View managed as determined by the MemoryTraits? + if constexpr (MapType::is_managed != 0) { + SharedAllocationHeader const* const header = + SharedAllocationHeader::get_header( + static_cast(map.data())); + char const* const label = header->label(); + strcat(err, label); + return; + } + strcat(err, "**UNAVAILABLE**"); + } + }();) + strcat(err, "\") with indices "); + append_formatted_multidimensional_index(err, args...); + strcat(err, " but extents "); + print_extents(err, map, std::make_index_sequence()); + Kokkos::abort(err); } } diff --git a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp index 7f7957bc61f2..30f6fa2ad23f 100644 --- a/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/packages/kokkos/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -38,12 +38,11 @@ #include #endif -#ifdef __SYCL_DEVICE_ONLY__ -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...) 
\ - do { \ - const __attribute__((opencl_constant)) char fmt[] = (format); \ - sycl::ext::oneapi::experimental::printf(fmt, ##__VA_ARGS__); \ - } while (0) +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20230200 +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) \ + accessor.get_multi_ptr() +#else +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() #endif #endif diff --git a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp index 91820fbccacf..e43535451c31 100644 --- a/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_IndexTypeTrait.hpp @@ -83,7 +83,7 @@ struct IndexTypePolicyMixin : AnalyzeNextTrait { "Kokkos Error: More than one index type given. Search " "compiler output for 'show_extra_index_type' to see the " "type of the errant tag."); - static_assert(std::is_integral::value, ""); + static_assert(std::is_integral::value); static constexpr bool index_type_is_defaulted = false; using index_type = Kokkos::IndexType; }; diff --git a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp index dadf582c3728..c2ca5a341f1c 100644 --- a/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_OccupancyControlTrait.hpp @@ -163,7 +163,7 @@ auto prefer(Policy const& p, DesiredOccupancy occ) { template constexpr auto prefer(Policy const& p, MaximizeOccupancy) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::OccupancyControlTrait::policy_with_trait; diff --git a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp index 578e9e762adb..98ad1d7ebbba 100644 --- a/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp 
+++ b/packages/kokkos/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp @@ -68,7 +68,7 @@ struct PolicyTraitAdaptorImpl< TraitSpec, PolicyTemplate, type_list, type_list, NewTrait, std::enable_if_t::value>> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; @@ -92,7 +92,7 @@ template class PolicyTemplate, struct PolicyTraitAdaptorImpl, type_list<>, NewTrait> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; diff --git a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp index 861300255305..4e91d89f0f9c 100644 --- a/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_ScheduleTrait.hpp @@ -78,7 +78,7 @@ namespace Experimental { template constexpr auto require(Policy const& p, Kokkos::Schedule) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::ScheduleTrait::policy_with_trait< Policy, Kokkos::Schedule>; return new_policy_t{p}; diff --git a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp index 8f95385c8517..ae7aa6e534fd 100644 --- a/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp +++ b/packages/kokkos/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp @@ -57,7 +57,7 @@ namespace Experimental { template constexpr auto require(const Policy p, WorkItemProperty::ImplWorkItemProperty) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::WorkItemPropertyTrait::policy_with_trait< Policy, WorkItemProperty::ImplWorkItemProperty>; return new_policy_t{p}; diff --git a/packages/kokkos/core/unit_test/CMakeLists.txt 
b/packages/kokkos/core/unit_test/CMakeLists.txt index b71c72c3c9f7..6dfb7505c5d4 100644 --- a/packages/kokkos/core/unit_test/CMakeLists.txt +++ b/packages/kokkos/core/unit_test/CMakeLists.txt @@ -65,7 +65,7 @@ SET(KOKKOS_THREADS_NAME Threads) IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) SET(KOKKOS_OPENACC_FEATURE_LEVEL 9) ELSE() - SET(KOKKOS_OPENACC_FEATURE_LEVEL 16) + SET(KOKKOS_OPENACC_FEATURE_LEVEL 17) ENDIF() SET(KOKKOS_OPENACC_NAME Experimental::OpenACC) @@ -86,11 +86,13 @@ SET(COMPILE_ONLY_SOURCES TestDetectionIdiom.cpp TestBitManipulation.cpp TestInterOp.cpp + TestRangePolicyCTAD.cpp TestStringManipulation.cpp TestVersionMacros.cpp TestViewRank.cpp TestViewTypeTraits.cpp TestTypeList.cpp + TestMDRangePolicyCTAD.cpp view/TestExtentsDatatypeConversion.cpp ) @@ -184,6 +186,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDSpan MinMaxClamp NumericTraits + OccupancyControlTrait Other ParallelScanRangePolicy Printf @@ -200,6 +203,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Reductions Reductions_DeviceView SharedAlloc + Swap ) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid @@ -233,6 +237,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewCopy_a ViewCopy_b ViewCtorDimMatch + ViewEmptyRuntimeUnmanaged ViewHooks ViewLayoutStrideAssignment ViewMapping_a @@ -240,6 +245,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewMapping_subview ViewMemoryAccessViolation ViewOfClass + ViewOutOfBoundsAccess ViewResize WorkGraph WithoutInitializing @@ -372,20 +378,21 @@ if(Kokkos_ENABLE_OPENMPTARGET) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp - IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND 
KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp - endif() IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_shared.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MinMaxClamp.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp + IF (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.0.3) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + endif() endif() # FIXME_OPENMPTARGET_CRAY: The following tests fail at compile time when the OpenMPTarget backend is enabled with the Cray compiler. # Atomic compare/exchange is used in these tests which can be one of the reasons for the compilation failures. 
@@ -522,17 +529,7 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) list(REMOVE_ITEM OpenACC_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp @@ -549,17 +546,10 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp 
${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp ) endif() @@ -677,7 +667,6 @@ endif() if (Kokkos_ENABLE_OPENMP) set(OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp - openmp/TestOpenMP_PartitionMaster.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_OpenMP @@ -724,12 +713,14 @@ if(Kokkos_ENABLE_HPX) hpx/TestHPX_IndependentInstancesRefCounting.cpp hpx/TestHPX_IndependentInstancesSynchronization.cpp ) +if(Kokkos_ENABLE_DEPRECATED_CODE_4) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HPX_InParallel SOURCES UnitTestMainInit.cpp hpx/TestHPX_InParallel.cpp ) + endif() endif() if(Kokkos_ENABLE_OPENMPTARGET) @@ -797,6 +788,12 @@ if(Kokkos_ENABLE_CUDA) UnitTestMain.cpp cuda/TestCuda_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_CudaInterOpStreamsMultiGPU + SOURCES + UnitTestMainInit.cpp + cuda/TestCuda_InterOp_StreamsMultiGPU.cpp + ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_CudaGraph SOURCES @@ -1039,13 +1036,7 @@ KOKKOS_ADD_ADVANCED_TEST( CoreUnitTest_PushFinalizeHook_terminate tools/TestCategoricalTuner.cpp ) endif() - if((NOT Kokkos_ENABLE_OPENMPTARGET) AND (NOT Kokkos_ENABLE_OPENACC)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_LogicalSpaces - SOURCES - tools/TestLogicalSpaces.cpp - ) - endif() + SET(KOKKOSP_SOURCES UnitTestMainInit.cpp tools/TestEventCorrectness.cpp @@ -1167,15 +1158,6 @@ KOKKOS_ADD_TEST( NAME CoreUnitTest_StackTraceTest ) endif() -if(Kokkos_ENABLE_DEPRECATED_CODE_3) - foreach(INITTESTS_NUM RANGE 1 18) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_DefaultInit_${INITTESTS_NUM} - SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp - ) - 
endforeach(INITTESTS_NUM) -endif() - if (KOKKOS_ENABLE_HWLOC) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HWLOC @@ -1259,12 +1241,10 @@ if (NOT KOKKOS_HAS_TRILINOS) INPUT TestDeviceAndThreads.py ${USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED} ) - if(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME_OPENMPTARGET does not select the right device - add_test( - NAME Kokkos_CoreUnitTest_DeviceAndThreads - COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py - ) - endif() + add_test( + NAME Kokkos_CoreUnitTest_DeviceAndThreads + COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py + ) endif() endif() diff --git a/packages/kokkos/core/unit_test/Makefile b/packages/kokkos/core/unit_test/Makefile index 33a84b61f92a..202809d3fc98 100644 --- a/packages/kokkos/core/unit_test/Makefile +++ b/packages/kokkos/core/unit_test/Makefile @@ -67,8 +67,8 @@ TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longi tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ ) \ ) @@ -82,8 +82,8 @@ KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST)) tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),, \ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ )\ ) @@ -91,8 +91,8 @@ tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ 
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter TestCuda_$(test).cpp, $(shell ls TestCuda_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > TestCuda_$(test).cpp); \ - $(shell echo "\#include " >> TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " > TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " >> TestCuda_$(test).cpp); \ )\ ) @@ -100,8 +100,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) @@ -277,8 +277,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) diff --git a/packages/kokkos/core/unit_test/TestAggregate.hpp b/packages/kokkos/core/unit_test/TestAggregate.hpp index 4f67b2eddceb..f1316a7426af 100644 --- a/packages/kokkos/core/unit_test/TestAggregate.hpp +++ b/packages/kokkos/core/unit_test/TestAggregate.hpp @@ -29,35 +29,31 @@ void TestViewAggregate() { value_type>; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); using a32_traits = Kokkos::ViewTraits; using flat_traits = Kokkos::ViewTraits; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); - static_assert(a32_traits::rank == 
2, ""); - static_assert(a32_traits::rank_dynamic == 2, ""); + std::is_same::value); + static_assert(a32_traits::rank == 2); + static_assert(a32_traits::rank_dynamic == 2); - static_assert(std::is_void::value, ""); - static_assert(flat_traits::rank == 3, ""); - static_assert(flat_traits::rank_dynamic == 2, ""); - static_assert(flat_traits::dimension::N2 == 32, ""); + static_assert(std::is_void::value); + static_assert(flat_traits::rank == 3); + static_assert(flat_traits::rank_dynamic == 2); + static_assert(flat_traits::dimension::N2 == 32); using a32_type = Kokkos::View **, DeviceType>; using a32_flat_type = typename a32_type::array_type; - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(a32_type::rank == 2, ""); - static_assert(a32_flat_type::rank == 3, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(a32_type::rank == 2); + static_assert(a32_flat_type::rank == 3); a32_type x("test", 4, 5); a32_flat_type y(x); diff --git a/packages/kokkos/core/unit_test/TestArray.cpp b/packages/kokkos/core/unit_test/TestArray.cpp index d3bdc4f93f7b..673d0036b716 100644 --- a/packages/kokkos/core/unit_test/TestArray.cpp +++ b/packages/kokkos/core/unit_test/TestArray.cpp @@ -49,4 +49,28 @@ KOKKOS_FUNCTION constexpr bool test_array_structured_binding_support() { static_assert(test_array_structured_binding_support()); +template +KOKKOS_FUNCTION constexpr bool is_equal(L const& l, R const& r) { + if (std::size(l) != std::size(r)) return false; + + for (size_t i = 0; i != std::size(l); ++i) { + if (l[i] != r[i]) return false; + } + + return true; +} + +// Disable ctad test for intel versions < 2021, see issue #6702 +#if !defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL >= 2021 +KOKKOS_FUNCTION constexpr bool test_array_ctad() { + constexpr int x = 10; + constexpr Kokkos::Array a{1, 2, 3, 5, x}; + constexpr Kokkos::Array b{1, 2, 3, 5, x}; + + return std::is_same_v && is_equal(a, 
b); +} + +static_assert(test_array_ctad()); +#endif + } // namespace diff --git a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp index a5aebed41380..cd7ba47aa1e9 100644 --- a/packages/kokkos/core/unit_test/TestAtomicOperations.hpp +++ b/packages/kokkos/core/unit_test/TestAtomicOperations.hpp @@ -368,6 +368,63 @@ bool atomic_op_test(T old_val, T update) { return result == 0; } +template +constexpr T relative_error_threshold = T(1.0e-15); + +template +bool atomic_op_test_rel(T old_val, T update) { + Kokkos::View op_data("op_data"); + Kokkos::deep_copy(op_data, old_val); + int result = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, 1), + KOKKOS_LAMBDA(int, int& local_result) { + auto fetch_result = + Op::atomic_op(&op_data(0), &op_data(1), &op_data(2), update); + T expected_val = Op::op(old_val, update); + Kokkos::memory_fence(); + if (expected_val == T(0)) { + if (fabs(op_data(0)) > relative_error_threshold) local_result += 1; + if (fabs(op_data(1)) > relative_error_threshold) local_result += 2; + if (fabs(op_data(2)) > relative_error_threshold) local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs(fetch_result.second) > relative_error_threshold) + local_result += 16; + } else { + if (fabs((op_data(0) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 1; + if (fabs((op_data(1) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 2; + if (fabs((op_data(2) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs((fetch_result.second - expected_val) / expected_val) > + relative_error_threshold) + local_result += 16; + } + }, + result); + if ((result & 1) != 0) + printf("atomic_%s failed with type %s\n", Op::name(), typeid(T).name()); + if ((result & 2) != 0) + printf("atomic_fetch_%s failed with type %s\n", Op::name(), 
+ typeid(T).name()); + if ((result & 4) != 0) + printf("atomic_%s_fetch failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 8) != 0) + printf("atomic_fetch_%s did not return old value with type %s\n", + Op::name(), typeid(T).name()); + if ((result & 16) != 0) + printf("atomic_%s_fetch did not return updated value with type %s\n", + Op::name(), typeid(T).name()); + + return result == 0; +} + //--------------------------------------------------- //--------------atomic_test_control------------------ //--------------------------------------------------- @@ -395,6 +452,12 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { case 9: return atomic_op_test(old_val, update); case 10: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // FIXME_NVHPC: atomic-fetch-shift operation fails due to NVHPC OpenACC + // compiler bugs, which are reported to NVIDIA. + case 11: return true; + case 12: return true; +#else case 11: return update_in >= 0 ? atomic_op_test( old_val, update) @@ -403,6 +466,7 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { return update_in >= 0 ? atomic_op_test( old_val, update) : true; +#endif case 13: return atomic_op_test(old_val, update); case 14: @@ -440,10 +504,20 @@ bool AtomicOperationsTestNonIntegralType(int old_val_in, int update_in, case 2: return atomic_op_test(old_val, update); case 3: return atomic_op_test(old_val, update); case 4: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // NVHPC may use different internal precisions for the device and host + // atomic operations. Therefore, relative errors are used to compare the + // host results and device results. + case 5: + return update != 0 ? atomic_op_test_rel( + old_val, update) + : true; +#else case 5: return update != 0 ? 
atomic_op_test(old_val, update) : true; +#endif case 6: return atomic_op_test(old_val, update); } diff --git a/packages/kokkos/core/unit_test/TestAtomics.hpp b/packages/kokkos/core/unit_test/TestAtomics.hpp index 2b40f12d0a4d..5f48e8c97460 100644 --- a/packages/kokkos/core/unit_test/TestAtomics.hpp +++ b/packages/kokkos/core/unit_test/TestAtomics.hpp @@ -498,7 +498,9 @@ TEST(TEST_CATEGORY, atomics) { ASSERT_TRUE((TestAtomic::Loop(100, 2))); ASSERT_TRUE((TestAtomic::Loop(100, 3))); -#ifndef KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET + // FIXME_OPENACC: atomic operations on composite types are not supported. +#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC) ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 1))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 2))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 3))); diff --git a/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp b/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp index 092e7cff6180..2f3bcfe817df 100644 --- a/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/packages/kokkos/core/unit_test/TestBitManipulationBuiltins.hpp @@ -804,26 +804,26 @@ struct TestBitCastFunction { using Kokkos::bit_cast; if (bit_cast(123) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #1\n"); + Kokkos::printf("failed check #1\n"); } if (bit_cast(123u) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #2\n"); + Kokkos::printf("failed check #2\n"); } if (bit_cast(~0u) != ~0) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #3\n"); + Kokkos::printf("failed check #3\n"); } if constexpr (sizeof(int) == sizeof(float)) { if (!check(12.34f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #4\n"); + Kokkos::printf("failed check #4\n"); } } if constexpr (sizeof(unsigned long long) == sizeof(double)) { if (!check(123.456)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #5\n"); + Kokkos::printf("failed check #5\n"); } 
} @@ -848,11 +848,11 @@ struct TestBitCastFunction { } if (!(bit_cast(arr) == arr)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #6\n"); + Kokkos::printf("failed check #6\n"); } if (!(bit_cast(arr2) == arr2)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #7\n"); + Kokkos::printf("failed check #7\n"); } } }; diff --git a/packages/kokkos/core/unit_test/TestComplex.hpp b/packages/kokkos/core/unit_test/TestComplex.hpp index bcae2e1d8160..5501a35b7f0f 100644 --- a/packages/kokkos/core/unit_test/TestComplex.hpp +++ b/packages/kokkos/core/unit_test/TestComplex.hpp @@ -451,17 +451,15 @@ TEST(TEST_CATEGORY, complex_issue_3867) { ASSERT_FLOAT_EQ(x.real(), y.real()); ASSERT_FLOAT_EQ(x.imag(), y.imag()); -#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); +#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex, long double, Kokkos::complex); diff --git a/packages/kokkos/core/unit_test/TestConcepts.hpp b/packages/kokkos/core/unit_test/TestConcepts.hpp index 476a8848325c..b85867bf63ac 100644 --- a/packages/kokkos/core/unit_test/TestConcepts.hpp +++ b/packages/kokkos/core/unit_test/TestConcepts.hpp @@ -22,42 +22,42 @@ using ExecutionSpace = TEST_EXECSPACE; using MemorySpace = typename ExecutionSpace::memory_space; using DeviceType = typename ExecutionSpace::device_type; -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); - -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(Kokkos::is_memory_space{}, ""); 
-static_assert(!Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); - -static_assert(Kokkos::is_device{}, ""); -static_assert(Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); - -static_assert(Kokkos::is_execution_space_v, ""); -static_assert(!Kokkos::is_execution_space_v, ""); +static_assert(Kokkos::is_execution_space{}); +static_assert(Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); + +static_assert(Kokkos::is_memory_space{}); +static_assert(Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); + +static_assert(Kokkos::is_device{}); +static_assert(Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); + +static_assert(Kokkos::is_execution_space_v); +static_assert(!Kokkos::is_execution_space_v); static_assert( - std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); + 
std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); /*------------------------------------------------- begin test for team_handle concept diff --git a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp deleted file mode 100644 index 929c91db4e00..000000000000 --- a/packages/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ /dev/null @@ -1,491 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include - -#include - -#ifdef KOKKOS_ENABLE_OPENMP -#include -#endif -#include -#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) - -namespace Test { - -namespace Impl { - -std::set delete_these; -void cleanup_memory() { - for (auto x : delete_these) { - delete[] x; - } -} - -char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, - bool do_other, bool do_tune, int& nargs, - Kokkos::InitArguments& init_args) { - nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) + - (do_other ? 4 : 0) + (do_tune ? 1 : 0); - - char** args_kokkos = new char*[nargs]; - const int max_args_size = 45; - for (int i = 0; i < nargs; i++) { - args_kokkos[i] = new char[max_args_size]; - delete_these.insert(args_kokkos[i]); - } - - int threads_idx = do_other ? 1 : 0; - int numa_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0); - int device_idx = - (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 
1 : 0); - int tune_idx = (do_other ? 4 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + - (do_device ? 1 : 0); - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - init_args.num_threads = nthreads; - snprintf(args_kokkos[threads_idx], max_args_size, "--threads=%i", nthreads); - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - init_args.num_numa = numa; - snprintf(args_kokkos[numa_idx], max_args_size, "--numa=%i", numa); - } - - if (do_device) { - init_args.device_id = 0; - snprintf(args_kokkos[device_idx], max_args_size, "--device-id=%i", 0); - } - - if (do_other) { - snprintf(args_kokkos[0], max_args_size, "--dummyarg=1"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], max_args_size, - "--dummy2arg"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], max_args_size, - "dummy3arg"); - snprintf(args_kokkos[device_idx + (do_device ? 
1 : 0)], max_args_size, - "dummy4arg=1"); - } - - if (do_tune) { - init_args.tune_internals = true; - snprintf(args_kokkos[tune_idx], max_args_size, "--kokkos-tune-internals"); - } - - return args_kokkos; -} - -Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, - bool do_device, bool do_tune) { - Kokkos::InitArguments args; - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) { - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - args.num_threads = nthreads; - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - args.num_numa = numa; - } - - if (do_device) { - args.device_id = 0; - } - - if (do_tune) { - args.tune_internals = true; - } - - return args; -} - -void check_correct_initialization(const Kokkos::InitArguments& argstruct) { - ASSERT_EQ(Kokkos::DefaultExecutionSpace::impl_is_initialized(), 1); - ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_is_initialized(), 1); - - // Figure out the number of threads the HostSpace ExecutionSpace should have - // initialized to. 
- int expected_nthreads = argstruct.num_threads; - -#ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - // use openmp default num threads - if (expected_nthreads < 0 || - (expected_nthreads == 0 && !Kokkos::hwloc::available())) { - expected_nthreads = omp_get_max_threads(); - } - // use hwloc if available - else if (expected_nthreads == 0 && Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } - } -#endif - - if (expected_nthreads < 1) { - if (Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } else { - expected_nthreads = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - expected_nthreads = 1; - } -#endif - -#ifdef KOKKOS_ENABLE_HPX - // HPX uses all cores on machine by default. Skip this test. - if (std::is_same::value || - std::is_same::value) { - return; - } -#endif - } - - int expected_numa = argstruct.num_numa; - - if (expected_numa < 1) { - if (Kokkos::hwloc::available()) { - expected_numa = Kokkos::hwloc::get_available_numa_count(); - } else { - expected_numa = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) - expected_numa = 1; -#endif - } - - ASSERT_EQ(Kokkos::HostSpace::execution_space().impl_thread_pool_size(), - expected_nthreads); - -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - int device; - cudaGetDevice(&device); - - int expected_device = argstruct.device_id; - if (argstruct.device_id < 0) { - expected_device = Kokkos::Cuda().cuda_device(); - } - - ASSERT_EQ(expected_device, device); - } -#endif - ASSERT_EQ(argstruct.tune_internals, Kokkos::tune_internals()); -} - -// TODO: Add check whether correct number of threads are actually started. 
-void test_no_arguments() { - Kokkos::initialize(); - check_correct_initialization(Kokkos::InitArguments()); - Kokkos::finalize(); -} - -void test_commandline_args(int nargs, char** args, - const Kokkos::InitArguments& argstruct) { - Kokkos::initialize(nargs, args); - check_correct_initialization(argstruct); - Kokkos::finalize(); -} - -void test_initstruct_args(const Kokkos::InitArguments& args) { - Kokkos::initialize(args); - check_correct_initialization(args); - Kokkos::finalize(); -} - -} // namespace Impl - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -TEST(defaultdevicetypeinit, no_args) { Impl::test_no_arguments(); } -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -TEST(defaultdevicetypeinit, commandline_args_empty) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -TEST(defaultdevicetypeinit, commandline_args_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, true, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -TEST(defaultdevicetypeinit, commandline_args_nthreads) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(true, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, false, false, false, nargs, argstruct); - 
Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -TEST(defaultdevicetypeinit, commandline_args_nthreads_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, false, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -TEST(defaultdevicetypeinit, commandline_args_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(false, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -TEST(defaultdevicetypeinit, commandline_args_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, true, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, true, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef 
KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other_tune) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, true, true, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -TEST(defaultdevicetypeinit, initstruct_default) { - Kokkos::InitArguments args; - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -TEST(defaultdevicetypeinit, initstruct_nthreads) { - Kokkos::InitArguments args = Impl::init_initstruct(true, false, false, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, false, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -TEST(defaultdevicetypeinit, initstruct_device) { - Kokkos::InitArguments args = Impl::init_initstruct(false, false, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -TEST(defaultdevicetypeinit, initstruct_nthreads_device) { - Kokkos::InitArguments args = Impl::init_initstruct(true, false, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, false); - Impl::test_initstruct_args(args); -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device_tune) { - Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, true); - Impl::test_initstruct_args(args); -} -#endif - -} // namespace Test - -#endif diff --git 
a/packages/kokkos/core/unit_test/TestDeviceAndThreads.py b/packages/kokkos/core/unit_test/TestDeviceAndThreads.py index 1d3ff8eea7e7..63d26ad41a44 100644 --- a/packages/kokkos/core/unit_test/TestDeviceAndThreads.py +++ b/packages/kokkos/core/unit_test/TestDeviceAndThreads.py @@ -17,6 +17,8 @@ import unittest import subprocess +import platform +import os PREFIX = "$" EXECUTABLE = "$" @@ -30,7 +32,22 @@ def GetFlag(flag, *extra_args): return int(p.stdout) def GetNumThreads(max_threads): - for x in [1, 2, 3, 5, 7]: + args = [] + name = platform.system() + if name == 'Darwin': + args = ['sysctl', '-n', 'hw.physicalcpu_max'] + elif name == 'Linux': + args = ['nproc', '--all'] + else: + args = ['wmic', 'cpu', 'get', 'NumberOfCores'] + + result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output = result.stdout.decode('utf-8') + phys_cores_count = int(output) + looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] \ + if GetFlag("hwloc_enabled") else [1,2,3,4,5] + + for x in looplist: if x >= max_threads: break yield x @@ -48,13 +65,25 @@ def test_num_threads(self): "num_threads", "--kokkos-num-threads={}".format(num_threads))) + def test_num_devices(self): + if "KOKKOS_VISIBLE_DEVICES" in os.environ: + self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set") + num_devices = GetFlag("num_devices") + self.assertNotEqual(num_devices, 0) + if num_devices == -1: + self.skipTest("no device backend enabled") + self.assertGreaterEqual(num_devices, 1) + def test_device_id(self): - device_count = GetFlag("device_count") - if device_count == 0: - self.skipTest("no device detected") + if "KOKKOS_VISIBLE_DEVICES" in os.environ: + self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set") + num_devices = GetFlag("num_devices") + if num_devices == -1: + self.assertEqual(-1, GetFlag("device_id")) + self.skipTest("no device backend enabled") # by default use the first GPU available for execution self.assertEqual(0, 
GetFlag("device_id")) - for device_id in range(device_count): + for device_id in range(num_devices): self.assertEqual( device_id, GetFlag( diff --git a/packages/kokkos/core/unit_test/TestExecutionSpace.hpp b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp index 6f0f159c1740..983a5975afd6 100644 --- a/packages/kokkos/core/unit_test/TestExecutionSpace.hpp +++ b/packages/kokkos/core/unit_test/TestExecutionSpace.hpp @@ -25,13 +25,7 @@ struct CheckClassWithExecutionSpaceAsDataMemberIsCopyable { Kokkos::DefaultExecutionSpace device; Kokkos::DefaultHostExecutionSpace host; - KOKKOS_FUNCTION void operator()(int, int& e) const { - // not actually doing anything useful, mostly checking that - // ExecutionSpace::in_parallel() is callable - if (static_cast(device.in_parallel()) < 0) { - ++e; - } - } + KOKKOS_FUNCTION void operator()(int i, int& e) const { e += i; } CheckClassWithExecutionSpaceAsDataMemberIsCopyable() { int errors; diff --git a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp index c024526111b1..e58324144e46 100644 --- a/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp +++ b/packages/kokkos/core/unit_test/TestFunctorAnalysis.hpp @@ -59,16 +59,15 @@ void test_functor_analysis() { using R01 = typename A01::Reducer; - static_assert(std::is_void::value, ""); - static_assert(std::is_void::value, ""); - static_assert(std::is_void::value, ""); - static_assert(std::is_same::value, - ""); - - static_assert(!A01::has_join_member_function, ""); - static_assert(!A01::has_init_member_function, ""); - static_assert(!A01::has_final_member_function, ""); - static_assert(A01::StaticValueSize == 0, ""); + static_assert(std::is_void::value); + static_assert(std::is_void::value); + static_assert(std::is_void::value); + static_assert(std::is_same::value); + + static_assert(!A01::has_join_member_function); + static_assert(!A01::has_init_member_function); + static_assert(!A01::has_final_member_function); + 
static_assert(A01::StaticValueSize == 0); ASSERT_EQ(R01(c01).length(), 0); //------------------------------ @@ -78,16 +77,15 @@ void test_functor_analysis() { Kokkos::RangePolicy, decltype(c02), void>; using R02 = typename A02::Reducer; - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); - static_assert(!A02::has_join_member_function, ""); - static_assert(!A02::has_init_member_function, ""); - static_assert(!A02::has_final_member_function, ""); - static_assert(A02::StaticValueSize == sizeof(double), ""); + static_assert(!A02::has_join_member_function); + static_assert(!A02::has_init_member_function); + static_assert(!A02::has_final_member_function); + static_assert(A02::StaticValueSize == sizeof(double)); ASSERT_EQ(R02(c02).length(), 1); //------------------------------ @@ -99,23 +97,19 @@ void test_functor_analysis() { using R03 = typename A03::Reducer; static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type*>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type&>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); - static_assert(A03::has_join_member_function, ""); - static_assert(A03::has_init_member_function, ""); - static_assert(!A03::has_final_member_function, ""); - static_assert( - A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), ""); + static_assert(A03::has_join_member_function); + static_assert(A03::has_init_member_function); + static_assert(!A03::has_final_member_function); + static_assert(A03::StaticValueSize == + sizeof(TestFunctorAnalysis_03::value_type)); ASSERT_EQ(R03(c03).length(), 1); 
//------------------------------ diff --git a/packages/kokkos/core/unit_test/TestHalfOperators.hpp b/packages/kokkos/core/unit_test/TestHalfOperators.hpp index 752e3b508161..c69cdd570342 100644 --- a/packages/kokkos/core/unit_test/TestHalfOperators.hpp +++ b/packages/kokkos/core/unit_test/TestHalfOperators.hpp @@ -268,96 +268,6 @@ enum OP_TESTS { N_OP_TESTS }; -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) -template -struct Functor_TestHalfVolatileOperators { - volatile half_type h_lhs, h_rhs; - view_type actual_lhs, expected_lhs; - double d_lhs, d_rhs; - Functor_TestHalfVolatileOperators(volatile half_type lhs = half_type(0), - volatile half_type rhs = half_type(0)) - : h_lhs(lhs), h_rhs(rhs) { - actual_lhs = view_type("actual_lhs", N_OP_TESTS); - expected_lhs = view_type("expected_lhs", N_OP_TESTS); - half_type nv_tmp; - nv_tmp = h_lhs; - d_lhs = static_cast(nv_tmp); - nv_tmp = h_rhs; - d_rhs = static_cast(nv_tmp); - if (std::is_same::value) { - auto run_on_host = *this; - run_on_host(0); - } else { - Kokkos::parallel_for("Test::Functor_TestHalfVolatileOperators", - Kokkos::RangePolicy(0, 1), *this); - } - } - - KOKKOS_FUNCTION - void operator()(int) const { - volatile half_type tmp_lhs; - half_type nv_tmp; - - // Initialze output views to catch missing test invocations - for (int i = 0; i < N_OP_TESTS; ++i) { - actual_lhs(i) = 1; - expected_lhs(i) = -1; - } - - nv_tmp = h_lhs; - actual_lhs(ASSIGN) = static_cast(nv_tmp); - expected_lhs(ASSIGN) = d_lhs; - - actual_lhs(LT_H_H) = h_lhs < h_rhs; - expected_lhs(LT_H_H) = d_lhs < d_rhs; - - actual_lhs(LE_H_H) = h_lhs <= h_rhs; - expected_lhs(LE_H_H) = d_lhs <= d_rhs; - - actual_lhs(NEQ) = h_lhs != h_rhs; - expected_lhs(NEQ) = d_lhs != d_rhs; - - actual_lhs(GT_H_H) = h_lhs > h_rhs; - expected_lhs(GT_H_H) = d_lhs > d_rhs; - - actual_lhs(GE_H_H) = h_lhs >= h_rhs; - expected_lhs(GE_H_H) = d_lhs >= d_rhs; - - actual_lhs(EQ) = h_lhs 
== h_rhs; - expected_lhs(EQ) = d_lhs == d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs += h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CADD_H_H) = static_cast(nv_tmp); - expected_lhs(CADD_H_H) = d_lhs; - expected_lhs(CADD_H_H) += d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs -= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CSUB_H_H) = static_cast(nv_tmp); - expected_lhs(CSUB_H_H) = d_lhs; - expected_lhs(CSUB_H_H) -= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs *= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CMUL_H_H) = static_cast(nv_tmp); - expected_lhs(CMUL_H_H) = d_lhs; - expected_lhs(CMUL_H_H) *= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs /= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CDIV_H_H) = static_cast(nv_tmp); - expected_lhs(CDIV_H_H) = d_lhs; - expected_lhs(CDIV_H_H) /= d_rhs; - } -}; -#endif - template struct Functor_TestHalfOperators { half_type h_lhs, h_rhs; @@ -995,33 +905,6 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { static_cast(epsilon)); } -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) - // Test partial volatile support - volatile half_type _h_lhs = h_lhs; - volatile half_type _h_rhs = h_rhs; - Functor_TestHalfVolatileOperators f_volatile_device( - _h_lhs, _h_rhs); - Functor_TestHalfVolatileOperators f_volatile_host( - _h_lhs, _h_rhs); - - ExecutionSpace().fence(); - Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs); - Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs); - for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { - // printf("op_test = %d\n", op_test); - if (op_test == ASSIGN || op_test == LT_H_H || op_test == LE_H_H || - op_test == NEQ || op_test == EQ || op_test == GT_H_H || - op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H || - op_test == CMUL_H_H || op_test == CDIV_H_H) { - ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - static_cast(epsilon)); - ASSERT_NEAR(f_host.actual_lhs(op_test), 
f_host.expected_lhs(op_test), - static_cast(epsilon)); - } - } -#endif - // is_trivially_copyable is false with the addition of explicit // copy constructors that are required for supporting reductions // ASSERT_TRUE(std::is_trivially_copyable::value); diff --git a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp index 3ee2ff52051a..467b9ad157fe 100644 --- a/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp +++ b/packages/kokkos/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp @@ -37,7 +37,7 @@ template struct CheckAccessStoredPointerAndDereferenceOnDevice { SmartPtr m_device_ptr; using ElementType = typename SmartPtr::element_type; - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); CheckAccessStoredPointerAndDereferenceOnDevice(SmartPtr device_ptr) : m_device_ptr(device_ptr) { diff --git a/packages/kokkos/core/unit_test/TestInitializationSettings.cpp b/packages/kokkos/core/unit_test/TestInitializationSettings.cpp index f5be0e47aab4..40dc3f11df35 100644 --- a/packages/kokkos/core/unit_test/TestInitializationSettings.cpp +++ b/packages/kokkos/core/unit_test/TestInitializationSettings.cpp @@ -20,30 +20,6 @@ namespace { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void take_initialization_settings(Kokkos::InitializationSettings const&) {} - -TEST(defaultdevicetype, - init_arguments_implicit_conversion_to_initialization_settings) { - Kokkos::InitArguments arguments; - take_initialization_settings(arguments); // check that conversion is implicit - arguments.device_id = 1; - arguments.tune_internals = true; - Kokkos::InitializationSettings settings{arguments}; - EXPECT_FALSE(settings.has_num_threads()); - EXPECT_TRUE(settings.has_device_id()); - EXPECT_EQ(settings.get_device_id(), 1); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); - EXPECT_FALSE(settings.has_disable_warnings()); - 
EXPECT_TRUE(settings.has_tune_internals()); - EXPECT_TRUE(settings.get_tune_internals()); - EXPECT_FALSE(settings.has_tools_help()); - EXPECT_FALSE(settings.has_tools_libs()); - EXPECT_FALSE(settings.has_tools_args()); -} -#endif - TEST(defaultdevicetype, initialization_settings) { auto const settings = Kokkos::InitializationSettings() .set_num_threads(255) @@ -52,8 +28,6 @@ TEST(defaultdevicetype, initialization_settings) { EXPECT_TRUE(settings.has_num_threads()); EXPECT_EQ(settings.get_num_threads(), 255); EXPECT_FALSE(settings.has_device_id()); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); EXPECT_TRUE(settings.has_disable_warnings()); EXPECT_FALSE(settings.get_disable_warnings()); EXPECT_FALSE(settings.has_tune_internals()); @@ -75,8 +49,6 @@ constexpr bool test_initialization_settings_getter() { TYPE>::value); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_threads, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(device_id, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_devices, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(skip_device, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(disable_warnings, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tune_internals, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tools_help, bool); diff --git a/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp b/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp index 24cf52aa7090..efe4a2307a82 100644 --- a/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp +++ b/packages/kokkos/core/unit_test/TestJoinBackwardCompatibility.hpp @@ -36,9 +36,8 @@ KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs, } static_assert((no_error | error_operator_plus_equal_volatile) == - error_operator_plus_equal_volatile, - ""); -static_assert((error_join_volatile | error_operator_plus_equal) == 0b101, ""); + error_operator_plus_equal_volatile); 
+static_assert((error_join_volatile | error_operator_plus_equal) == 0b101); struct MyJoinBackCompatValueType { MyErrorCode err = no_error; diff --git a/packages/kokkos/core/unit_test/TestMDRangePolicyCTAD.cpp b/packages/kokkos/core/unit_test/TestMDRangePolicyCTAD.cpp new file mode 100644 index 000000000000..b2c3d021c353 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestMDRangePolicyCTAD.cpp @@ -0,0 +1,138 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestMDRangePolicyCTAD { + template + static void maybe_unused(Ts&&...) 
{} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int t[5]; + [[maybe_unused]] static inline int64_t tt[5]; + [[maybe_unused]] static inline Kokkos::Array a; + [[maybe_unused]] static inline Kokkos::Array aa; + [[maybe_unused]] static inline int64_t i64; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for HIP-ROCm-5.2 "declared but never referenced" + TestMDRangePolicyCTAD() { + maybe_unused(des, notEs, ses, t, tt, a, aa, notEsToDes, i64); + } + + // MDRangePolicy with C array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, t, t))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, t, t))>); + + // MDRangePolicy with Kokkos::initializer_list parameters + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}))>); + + 
static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + des, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(ses, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + // MDRangePolicy with Kokkos::Array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a, aa))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a))>); + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a, aa))>); +}; + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp b/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp index f577f415e7cb..6f241b45d472 100644 --- a/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp +++ b/packages/kokkos/core/unit_test/TestMDRangePolicyConstructors.hpp @@ -18,6 +18,8 @@ #include +#include + namespace { template @@ -86,12 +88,56 @@ TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) { using Policy = Kokkos::MDRangePolicy, Kokkos::IndexType>; + std::string msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is " + "performed on a bound 
(-1) in dimension (0), which may not preserve its " + "original value.\n"; + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { - (void)Policy({-1, 0}, {2, 3}); - }, - "unsafe narrowing conversion"); + ASSERT_DEATH({ (void)Policy({-1, 0}, {2, 3}); }, expected); +} + +TEST(TEST_CATEGORY_DEATH, policy_invalid_bounds) { + using Policy = Kokkos::MDRangePolicy>; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + auto [dim0, dim1] = (Policy::inner_direction == Kokkos::Iterate::Right) + ? std::make_pair(1, 0) + : std::make_pair(0, 1); + std::string msg1 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim0) + ".\n"; + + std::string msg2 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim1) + ".\n"; + +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + // escape the parentheses in the regex to match the error message + msg1 = std::regex_replace(msg1, std::regex("\\(|\\)"), "\\$&"); + (void)msg2; + ASSERT_DEATH({ (void)Policy({100, 100}, {90, 90}); }, msg1); +#else + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + ::testing::internal::CaptureStderr(); + (void)Policy({100, 100}, {90, 90}); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg1 + msg2); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg1; + (void)msg2; +#endif + +#endif } #endif diff --git a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp index 424ba05a9041..ad035d4e4bf7 100644 --- a/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp +++ b/packages/kokkos/core/unit_test/TestMathematicalFunctions.hpp @@ 
-287,21 +287,20 @@ struct FloatingPointComparison { public: template - KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, double ulp) const { + KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, int ulp) const { auto abs_tol = eps(fpv) * ulp; bool ar = absolute(fpv) < abs_tol; if (!ar) { Kokkos::printf("absolute value exceeds tolerance [|%e| > %e]\n", - (double)fpv, abs_tol); + (double)fpv, (double)abs_tol); } return ar; } template - KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, - double ulp) const { + KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, int ulp) const { if (lhs == 0) { return compare_near_zero(rhs, ulp); } else if (rhs == 0) { @@ -315,7 +314,7 @@ struct FloatingPointComparison { bool ar = abs_diff == 0 || rel_diff < rel_tol; if (!ar) { Kokkos::printf("relative difference exceeds tolerance [%e > %e]\n", - (double)rel_diff, rel_tol); + (double)rel_diff, (double)rel_tol); } return ar; @@ -348,7 +347,7 @@ struct math_function_name; } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ template <> \ @@ -373,7 +372,7 @@ struct math_function_name; math_unary_function_return_type_t>::value); \ return REF_FUNC; \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ template <> \ @@ -477,7 +476,7 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathBinaryFunction_##FUNC; \ template <> \ @@ -511,7 +510,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); math_ternary_function_return_type_t>::value); \ return 
std::FUNC(x, y, z); \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk3_##FUNC = MathTernaryFunction_##FUNC; \ template <> \ @@ -1307,12 +1306,12 @@ struct TestAbsoluteValueFunction { if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::half_t)\n"); + Kokkos::printf("failed abs(KE::half_t)\n"); } if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::bhalf_t)\n"); + Kokkos::printf("failed abs(KE::bhalf_t)\n"); } if (abs(5.) != 5. || abs(-5.) != 5.) { ++e; @@ -1332,19 +1331,17 @@ struct TestAbsoluteValueFunction { Kokkos::printf("failed abs(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); static_assert(std::is_same(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; @@ -1365,26 +1362,26 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::fabs; if (fabs(4.f) != 4.f || fabs(-4.f) != 4.f) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(float)\n"); + Kokkos::printf("failed fabs(float)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed 
fabs(KE::half_t)\n"); + Kokkos::printf("failed fabs(KE::half_t)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::bhalf_t)\n"); + Kokkos::printf("failed fabs(KE::bhalf_t)\n"); } if (fabs(5.) != 5. || fabs(-5.) != 5.) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(double)\n"); + Kokkos::printf("failed fabs(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (fabs(6.l) != 6.l || fabs(-6.l) != 6.l) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(long double)\n"); + Kokkos::printf("failed fabs(long double)\n"); } #endif // special values @@ -1392,8 +1389,7 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::isnan; if (fabs(-0.) != 0. || !isinf(fabs(-INFINITY)) || !isnan(fabs(-NAN))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fabs(floating_point) special values\n"); + Kokkos::printf("failed fabs(floating_point) special values\n"); } static_assert(std::is_same(4.f))), @@ -1425,7 +1421,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(fmod(6.2f, 4.f), 2.2f, 1) && !compare(fmod(-6.2f, 4.f), -2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(float)\n"); + Kokkos::printf("failed fmod(float)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1434,7 +1430,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { fmod(static_cast(-6.2f), static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::half_t)\n"); + Kokkos::printf("failed fmod(KE::half_t)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1443,17 +1439,17 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::bhalf_t)\n"); + Kokkos::printf("failed fmod(KE::bhalf_t)\n"); } if (!compare(fmod(6.2, 4.), 2.2, 1) 
&& !compare(fmod(-6.2, 4.), -2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(double)\n"); + Kokkos::printf("failed fmod(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(fmod(6.2l, 4.l), 2.2l, 1) && !compare(fmod(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(long double)\n"); + Kokkos::printf("failed fmod(long double)\n"); } #endif // special values @@ -1462,23 +1458,19 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(fmod(-KE::infinity::value, 1.f)) && !isnan(fmod(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fmod(floating_point) special values\n"); + Kokkos::printf("failed fmod(floating_point) special values\n"); } static_assert(std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); #endif } }; @@ -1502,7 +1494,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(remainder(6.2f, 4.f), 2.2f, 2) && !compare(remainder(-6.2f, 4.f), 2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(float)\n"); + Kokkos::printf("failed remainder(float)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1511,7 +1503,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::half_t)\n"); + Kokkos::printf("failed remainder(KE::half_t)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1520,18 
+1512,18 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::bhalf_t)\n"); + Kokkos::printf("failed remainder(KE::bhalf_t)\n"); } if (!compare(remainder(6.2, 4.), 2.2, 2) && !compare(remainder(-6.2, 4.), 2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(double)\n"); + Kokkos::printf("failed remainder(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(remainder(6.2l, 4.l), 2.2l, 1) && !compare(remainder(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(long double)\n"); + Kokkos::printf("failed remainder(long double)\n"); } #endif // special values @@ -1540,26 +1532,23 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(remainder(-KE::infinity::value, 1.f)) && !isnan(remainder(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( + Kokkos::printf( "failed remainder(floating_point) special values\n"); } static_assert( std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert( std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS static_assert( - std::is_same::value, ""); + std::is_same::value); #endif } }; @@ -1765,7 +1754,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n"); + Kokkos::printf("failed isnan(KE::half_t)\n"); } if (isnan(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1775,7 +1764,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n"); + Kokkos::printf("failed 
isnan(KE::bhalf_t)\n"); } if (isnan(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1801,11 +1790,11 @@ struct TestIsNaN { Kokkos::printf("failed isnan(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; diff --git a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp index 06c84c751370..7969dc86864c 100644 --- a/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/packages/kokkos/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -1213,13 +1213,13 @@ struct TestComplexBesselI0K0Function { } EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0)); - int upper_limit = N; + int upper_limit_0 = N; // FIXME_SYCL Failing for Intel GPUs, 19 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 19; + upper_limit_0 = 19; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_0; i++) { EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)), Kokkos::abs(h_ref_cbk0(i)) * 1e-13) << "at index " << i; @@ -1462,13 +1462,13 @@ struct TestComplexBesselI1K1Function { } EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0)); - int upper_limit = N; + int upper_limit_1 = N; // FIXME_SYCL Failing for Intel GPUs, 8 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 8; + upper_limit_1 = 8; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_1; i++) { EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)), Kokkos::abs(h_ref_cbk1(i)) * 
1e-13) << "at index " << i; @@ -1718,20 +1718,26 @@ struct TestComplexBesselH1Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch10(0), h_ch10(0)); - for (int i = 1; i < N; i++) { + int upper_limit_10 = N; +// FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_10 = 17; +#endif + for (int i = 1; i < upper_limit_10; i++) { EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)), Kokkos::abs(h_ref_ch10(i)) * 1e-13) << "at index " << i; } EXPECT_EQ(h_ref_ch11(0), h_ch11(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case + int upper_limit_11 = N; + // FIXME_SYCL Failing for Intel GPUs, 2 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 16; + upper_limit_11 = 2; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_11; i++) { EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)), Kokkos::abs(h_ref_ch11(i)) * 1e-13) << "at index " << i; @@ -1912,19 +1918,26 @@ struct TestComplexBesselH2Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch20(0), h_ch20(0)); - for (int i = 1; i < N; i++) { + int upper_limit_20 = N; +// FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_20 = 16; +#endif + for (int i = 1; i < upper_limit_20; i++) { EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)), - Kokkos::abs(h_ref_ch20(i)) * 1e-13); + Kokkos::abs(h_ref_ch20(i)) * 1e-13) + << "at index " << i; } EXPECT_EQ(h_ref_ch21(0), h_ch21(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case + int upper_limit_21 = N; + // FIXME_SYCL Failing 
for Intel GPUs, 1 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 17; + upper_limit_21 = 1; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_21; i++) { EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)), Kokkos::abs(h_ref_ch21(i)) * 1e-13) << "at index " << i; @@ -1954,31 +1967,61 @@ TEST(TEST_CATEGORY, mathspecialfunc_errorfunc) { #endif TEST(TEST_CATEGORY, mathspecialfunc_cbesselj0y0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ0Y0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselj1y1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ1Y1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli0k0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI0K0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli1k1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI1K1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh1stkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + 
"Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh2ndkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH2Function test; test.testit(); } diff --git a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp index eaf7a4125cc3..116ac58c39ff 100644 --- a/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp +++ b/packages/kokkos/core/unit_test/TestNonTrivialScalarTypes.hpp @@ -214,7 +214,7 @@ struct point_t { uint8_t x, y, z; KOKKOS_FUNCTION - point_t() : x(1), y(1), z(1){}; + point_t() : x(0), y(0), z(0){}; KOKKOS_FUNCTION point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){}; diff --git a/packages/kokkos/core/unit_test/TestNumericTraits.hpp b/packages/kokkos/core/unit_test/TestNumericTraits.hpp index ec1c1e0ca0b4..81a9d0a5e0dd 100644 --- a/packages/kokkos/core/unit_test/TestNumericTraits.hpp +++ b/packages/kokkos/core/unit_test/TestNumericTraits.hpp @@ -210,9 +210,10 @@ TEST(TEST_CATEGORY, numeric_traits_infinity) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -224,9 +225,9 @@ TEST(TEST_CATEGORY, numeric_traits_epsilon) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: 
+ // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -239,9 +240,9 @@ TEST(TEST_CATEGORY, numeric_traits_round_error) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -253,9 +254,9 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -263,9 +264,9 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { TEST(TEST_CATEGORY, numeric_traits_denorm_min) { TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -302,8 +303,10 @@ TEST(TEST_CATEGORY, numeric_traits_finite_min_max) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if 
(!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -326,8 +329,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -349,8 +354,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -358,8 +365,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TEST(TEST_CATEGORY, numeric_traits_max_digits10) { TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -380,8 +389,10 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel 
GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -395,8 +406,10 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -407,8 +420,10 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -426,8 +441,10 @@ TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -442,7 +459,7 @@ struct HasNoSpecialization {}; using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT::value); \ template \ using has_##TRAIT = Kokkos::is_detected; \ - static_assert(!has_##TRAIT::value, ""); + 
static_assert(!has_##TRAIT::value); CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity) CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min) @@ -524,39 +541,39 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif // clang-format off -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min(), ""); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min()); // integer types -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min(), ""); -static_assert(Kokkos::Experimental::finite_max::value 
== std::numeric_limits< char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min()); 
+static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // floating point types -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max()); 
+static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // clang-format on CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits); @@ -623,15 +640,13 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10); #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT -#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ - static_assert( \ - std::numeric_limits::TRAIT() != std::numeric_limits::TRAIT(), ""); \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - std::numeric_limits::TRAIT(), \ - "") +#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + Kokkos::Experimental::TRAIT::value); \ + static_assert(std::numeric_limits::TRAIT() != \ + std::numeric_limits::TRAIT()); \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + std::numeric_limits::TRAIT()) // Workaround compiler issue error: expression must have a constant value // See kokkos/kokkos#4574 @@ -651,14 +666,11 @@ CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, signaling_NaN); #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ 
static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ @@ -706,17 +718,13 @@ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(max_exponent10); #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ diff --git a/packages/kokkos/core/unit_test/TestOccupancyControlTrait.hpp b/packages/kokkos/core/unit_test/TestOccupancyControlTrait.hpp new file mode 100644 index 000000000000..345a906d6683 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestOccupancyControlTrait.hpp @@ -0,0 +1,80 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +template +void test_policy_execution(const Kokkos::RangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int){}); +} +template +void test_policy_execution(const Kokkos::TeamPolicy& policy) { + Kokkos::parallel_for( + policy, + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy::member_type&){}); +} +template +void test_policy_execution(const Kokkos::MDRangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int, int){}); +} + +template +void test_prefer_desired_occupancy(Policy policy) { + using Kokkos::Experimental::DesiredOccupancy; + using Kokkos::Experimental::MaximizeOccupancy; + using Kokkos::Experimental::prefer; + using Kokkos::Experimental::WorkItemProperty; + + // MaximizeOccupancy -> MaximizeOccupancy + auto const policy_still_no_occ = prefer(policy, MaximizeOccupancy{}); + test_policy_execution(policy_still_no_occ); + + // MaximizeOccupancy -> DesiredOccupancy + auto const policy_with_occ = + prefer(policy_still_no_occ, DesiredOccupancy{33}); + test_policy_execution(policy_with_occ); + + // DesiredOccupancy -> DesiredOccupancy + auto const policy_change_occ = prefer(policy_with_occ, DesiredOccupancy{24}); + test_policy_execution(policy_change_occ); + + // DesiredOccupancy -> DesiredOccupancy w/ hint + auto policy_with_occ_and_hint = Kokkos::Experimental::require( + policy_change_occ, + Kokkos::Experimental::WorkItemProperty::HintLightWeight); + test_policy_execution(policy_with_occ_and_hint); + + // DesiredOccupancy -> MaximizeOccupancy + auto const policy_drop_occ = + prefer(policy_with_occ_and_hint, MaximizeOccupancy{}); + test_policy_execution(policy_drop_occ); +} + +// FIXME_MSVC_WITH_CUDA +// This test doesn't compile with CUDA on Windows +#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) +TEST(TEST_CATEGORY, occupancy_control) { + test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 1)); + 
test_prefer_desired_occupancy( + Kokkos::TeamPolicy{1, Kokkos::AUTO}); + test_prefer_desired_occupancy( + Kokkos::MDRangePolicy>{{0, 0}, {1, 1}}); +} +#endif +} // namespace diff --git a/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp b/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp index 176ce9b5fed4..a56dfd9efc77 100644 --- a/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp +++ b/packages/kokkos/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp @@ -166,22 +166,6 @@ TEST(defaultdevicetype, cmd_line_args_device_id) { EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--dummy"}); } -TEST(defaultdevicetype, cmd_line_args_num_devices) { - CmdLineArgsHelper cla = {{ - "--kokkos-num-devices=5,6", - "--kokkos-num-devices=7", - "-v", - }}; - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 7); - // this is the current behavior, not suggesting this cannot be revisited - EXPECT_TRUE(settings.has_skip_device()) << "behavior changed see comment"; - EXPECT_EQ(settings.get_skip_device(), 6) << "behavior changed see comment"; - EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"-v"}); -} - TEST(defaultdevicetype, cmd_line_args_disable_warning) { CmdLineArgsHelper cla = {{ "--kokkos-disable-warnings=1", @@ -351,20 +335,6 @@ TEST(defaultdevicetype, env_vars_device_id) { EXPECT_EQ(settings.get_device_id(), 33); } -TEST(defaultdevicetype, env_vars_num_devices) { - EnvVarsHelper ev = {{ - {"KOKKOS_NUM_DEVICES", "4"}, - {"KOKKOS_SKIP_DEVICE", "1"}, - }}; - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_environment_variables(settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 4); - EXPECT_TRUE(settings.has_skip_device()); - EXPECT_EQ(settings.get_skip_device(), 1); -} - 
TEST(defaultdevicetype, env_vars_disable_warnings) { for (auto const& value_true : {"1", "true", "TRUE", "yEs"}) { EnvVarsHelper ev = {{ @@ -420,22 +390,20 @@ TEST(defaultdevicetype, env_vars_tune_internals) { } TEST(defaultdevicetype, visible_devices) { -#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ - do { \ - EnvVarsHelper ev{ENV}; \ - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ - Kokkos::InitializationSettings settings; \ - Kokkos::Impl::parse_environment_variables(settings); \ - auto computed = Kokkos::Impl::get_visible_devices(settings, CNT); \ - std::vector expected = DEV; \ - EXPECT_EQ(expected.size(), computed.size()) \ - << ev << "device count: " << CNT; \ - auto n = std::min(expected.size(), computed.size()); \ - for (int i = 0; i < n; ++i) { \ - EXPECT_EQ(expected[i], computed[i]) \ - << "devices differ at index " << i << '\n' \ - << ev << "device count: " << CNT; \ - } \ +#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ + do { \ + EnvVarsHelper ev{ENV}; \ + SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ + auto computed = Kokkos::Impl::get_visible_devices(CNT); \ + std::vector expected = DEV; \ + EXPECT_EQ(expected.size(), computed.size()) \ + << ev << "device count: " << CNT; \ + auto n = std::min(expected.size(), computed.size()); \ + for (int i = 0; i < n; ++i) { \ + EXPECT_EQ(expected[i], computed[i]) \ + << "devices differ at index " << i << '\n' \ + << ev << "device count: " << CNT; \ + } \ } while (false) #define DEV(...) \ @@ -444,6 +412,8 @@ TEST(defaultdevicetype, visible_devices) { // first test with all environment variables that are involved in determining // the visible devices so user set var do not mess up the logic below. + // KOKKOS_NUM_DEVICES and KOKKOS_SKIP_DEVICE are deprecated since 3.7 and are + // not taken into account anymore. 
KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, {"KOKKOS_SKIP_DEVICE", "1"}), @@ -452,10 +422,10 @@ TEST(defaultdevicetype, visible_devices) { ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, ), 6, DEV(2, 1)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_NUM_DEVICES", "3"}), 6, - DEV(0, 1, 2)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_NUM_DEVICES", "4"}, {"KOKKOS_SKIP_DEVICE", "1"}, ), 6, - DEV(0, 2, 3)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_VISIBLE_DEVICES", "1,3,4"}), 6, DEV(1, 3, 4)); KOKKOS_TEST_VISIBLE_DEVICES( diff --git a/packages/kokkos/core/unit_test/TestRangePolicyCTAD.cpp b/packages/kokkos/core/unit_test/TestRangePolicyCTAD.cpp new file mode 100644 index 000000000000..20288e2b40a2 --- /dev/null +++ b/packages/kokkos/core/unit_test/TestRangePolicyCTAD.cpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "Kokkos_Core_fwd.hpp" + +namespace { + +struct TestRangePolicyCTAD { + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + + [[maybe_unused]] static int concurrency() { return 0; } + }; + static_assert(Kokkos::is_execution_space_v); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline auto i64 = int64_t(); + [[maybe_unused]] static inline auto i32 = int32_t(); + [[maybe_unused]] static inline auto cs = Kokkos::ChunkSize(0); + [[maybe_unused]] static inline auto des = Kokkos::DefaultExecutionSpace(); + [[maybe_unused]] static inline auto nes = + ImplicitlyConvertibleToDefaultExecutionSpace(); + [[maybe_unused]] static inline auto ses = SomeExecutionSpace(); + + // RangePolicy() + + [[maybe_unused]] static inline auto rp = Kokkos::RangePolicy{}; + static_assert(std::is_same_v, decltype(rp)>); + + // RangePolicy(index_type, index_type) + + [[maybe_unused]] static inline auto rpi64i64 = Kokkos::RangePolicy(i64, i64); + static_assert(std::is_same_v, decltype(rpi64i64)>); + + [[maybe_unused]] static inline auto rpi64i32 = Kokkos::RangePolicy(i64, i32); + static_assert(std::is_same_v, decltype(rpi64i32)>); + + [[maybe_unused]] static inline auto rpi32i64 = Kokkos::RangePolicy(i32, i64); + static_assert(std::is_same_v, decltype(rpi32i64)>); + + [[maybe_unused]] static inline auto rpi32i32 = Kokkos::RangePolicy(i32, i32); + static_assert(std::is_same_v, decltype(rpi32i32)>); + + // RangePolicy(index_type, index_type, ChunkSize) + + [[maybe_unused]] static inline auto rpi64i64cs = + Kokkos::RangePolicy(i64, i64, cs); + static_assert(std::is_same_v, decltype(rpi64i64cs)>); + + 
[[maybe_unused]] static inline auto rpi64i32cs = + Kokkos::RangePolicy(i64, i32, cs); + static_assert(std::is_same_v, decltype(rpi64i32cs)>); + + [[maybe_unused]] static inline auto rpi32i64cs = + Kokkos::RangePolicy(i32, i64, cs); + static_assert(std::is_same_v, decltype(rpi32i64cs)>); + + [[maybe_unused]] static inline auto rpi32i32cs = + Kokkos::RangePolicy(i32, i32, cs); + static_assert(std::is_same_v, decltype(rpi32i32cs)>); + + // RangePolicy(execution_space, index_type, index_type) + + [[maybe_unused]] static inline auto rpdesi64i64 = + Kokkos::RangePolicy(des, i64, i64); + static_assert(std::is_same_v, decltype(rpdesi64i64)>); + + [[maybe_unused]] static inline auto rpdesi32i32 = + Kokkos::RangePolicy(des, i32, i32); + static_assert(std::is_same_v, decltype(rpdesi32i32)>); + + [[maybe_unused]] static inline auto rpnesi64i64 = + Kokkos::RangePolicy(nes, i64, i64); + static_assert(std::is_same_v, decltype(rpnesi64i64)>); + + [[maybe_unused]] static inline auto rpnesi32i32 = + Kokkos::RangePolicy(nes, i32, i32); + static_assert(std::is_same_v, decltype(rpnesi32i32)>); + + [[maybe_unused]] static inline auto rpsesi64i64 = + Kokkos::RangePolicy(ses, i64, i64); + static_assert(std::is_same_v, + decltype(rpsesi64i64)>); + + [[maybe_unused]] static inline auto rpsesi32i32 = + Kokkos::RangePolicy(ses, i32, i32); + static_assert(std::is_same_v, + decltype(rpsesi32i32)>); + + // RangePolicy(execution_space, index_type, index_type, ChunkSize) + + [[maybe_unused]] static inline auto rpdesi64i64cs = + Kokkos::RangePolicy(des, i64, i64, cs); + static_assert(std::is_same_v, decltype(rpdesi64i64cs)>); + + [[maybe_unused]] static inline auto rpdesi32i32cs = + Kokkos::RangePolicy(des, i32, i32, cs); + static_assert(std::is_same_v, decltype(rpdesi32i32cs)>); + + [[maybe_unused]] static inline auto rpnesi64i64cs = + Kokkos::RangePolicy(nes, i64, i64, cs); + static_assert(std::is_same_v, decltype(rpnesi64i64cs)>); + + [[maybe_unused]] static inline auto rpnesi32i32cs = + 
Kokkos::RangePolicy(nes, i32, i32, cs); + static_assert(std::is_same_v, decltype(rpnesi32i32cs)>); + + [[maybe_unused]] static inline auto rpsesi64i64cs = + Kokkos::RangePolicy(ses, i64, i64, cs); + static_assert(std::is_same_v, + decltype(rpsesi64i64cs)>); + + [[maybe_unused]] static inline auto rpsesi32i32cs = + Kokkos::RangePolicy(ses, i32, i32, cs); + static_assert(std::is_same_v, + decltype(rpsesi32i32cs)>); + +}; // TestRangePolicyCTAD struct + +// To eliminate maybe_unused warning on some compilers + +[[maybe_unused]] const Kokkos::DefaultExecutionSpace nestodes = + TestRangePolicyCTAD::ImplicitlyConvertibleToDefaultExecutionSpace(); + +[[maybe_unused]] const auto sesconcurrency = + TestRangePolicyCTAD::ses.concurrency(); + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestRangePolicyConstructors.hpp b/packages/kokkos/core/unit_test/TestRangePolicyConstructors.hpp index 0a7e59ed980c..c8c1542af138 100644 --- a/packages/kokkos/core/unit_test/TestRangePolicyConstructors.hpp +++ b/packages/kokkos/core/unit_test/TestRangePolicyConstructors.hpp @@ -18,6 +18,9 @@ #include +#include +#include + namespace { TEST(TEST_CATEGORY, range_policy_runtime_parameters) { @@ -70,4 +73,127 @@ TEST(TEST_CATEGORY, range_policy_runtime_parameters) { } } +TEST(TEST_CATEGORY_DEATH, range_policy_invalid_bounds) { + using Policy = Kokkos::RangePolicy; + using ChunkSize = Kokkos::ChunkSize; + + std::string msg = + "Kokkos::RangePolicy bounds error: The lower bound (100) is greater than " + "the upper bound (90).\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // escape the parentheses in the regex to match the error message + msg = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + ASSERT_DEATH({ (void)Policy(100, 90); }, msg); + + ASSERT_DEATH({ (void)Policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); }, + msg); +#else + + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + { + ::testing::internal::CaptureStderr(); + 
Policy policy(100, 90); + ASSERT_EQ((int)policy.begin(), 0); + ASSERT_EQ((int)policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; +#endif + } + + { + ::testing::internal::CaptureStderr(); + Policy policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); + ASSERT_EQ((int)policy.begin(), 0); + ASSERT_EQ((int)policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; +#endif + } + +#endif +} + +TEST(TEST_CATEGORY_DEATH, range_policy_implicitly_converted_bounds) { + using UIntIndexType = Kokkos::IndexType; + using IntIndexType = Kokkos::IndexType; + using UIntPolicy = Kokkos::RangePolicy; + using IntPolicy = Kokkos::RangePolicy; + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion is " + "performed on a bound (), which may not preserve its original value.\n"; + + auto get_error_msg = [](auto str, auto val) { + return str.insert(str.find("(") + 1, std::to_string(val).c_str()); + }; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10); }, + get_error_msg(expected, test_val)); + } + { + unsigned test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0u, test_val); }, + get_error_msg(expected, test_val)); + } + { + long long test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0LL, test_val); }, + get_error_msg(expected, test_val)); + } + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10, Kokkos::ChunkSize(2)); }, + get_error_msg(expected, test_val)); + } + +#else + { + ::testing::internal::CaptureStderr(); + int test_val = -1; + UIntPolicy 
policy(test_val, 10); + ASSERT_EQ(policy.begin(), 0u); + ASSERT_EQ(policy.end(), 0u); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } + { + ::testing::internal::CaptureStderr(); + unsigned test_val = std::numeric_limits::max(); + IntPolicy policy(0u, test_val); + ASSERT_EQ(policy.begin(), 0); + ASSERT_EQ(policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } +#endif +} + } // namespace diff --git a/packages/kokkos/core/unit_test/TestReducers.hpp b/packages/kokkos/core/unit_test/TestReducers.hpp index 957b9a0ca1a7..fbcb9629af0a 100644 --- a/packages/kokkos/core/unit_test/TestReducers.hpp +++ b/packages/kokkos/core/unit_test/TestReducers.hpp @@ -19,6 +19,7 @@ #include #include +#include //-------------------------------------------------------------------------- @@ -46,6 +47,37 @@ struct TestReducers { void operator()(const int& i, Scalar& value) const { value += values(i); } }; + struct TeamSumFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m, Scalar& value) const { + if (m.team_rank() == m.team_size() - 1) value += Scalar(1); + } + }; + + struct TeamSumNestedFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + SumFunctor f; + int M, N; + Kokkos::View result; + + TeamSumNestedFunctor(SumFunctor& f_, const int M_, const int N_, + Kokkos::View result_) + : f(f_), M(M_), 
N(N_), result(result_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m) const { + const int i = m.league_rank(); + Scalar local_scalar; + Kokkos::Sum reducer_scalar( + local_scalar); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(m, N), f, reducer_scalar); + result(i) = local_scalar; + } + }; + struct ProdFunctor { Kokkos::View values; @@ -319,6 +351,102 @@ struct TestReducers { value = value || values(i); } }; + + // get number of teams for TeamPolicy depending on the tested type + constexpr static int get_num_teams() { + if constexpr (sizeof(Scalar) == 1) { + return 126; + } else if constexpr (std::is_same_v) { + return 256; + } + + return 1024; + } + + static void test_sum_team_policy(int N, SumFunctor f, Scalar reference_sum) { +#ifdef KOKKOS_ENABLE_OPENACC + if constexpr (std::is_same_v && + (std::is_same_v || + std::is_same_v)) { + return; // FIXME_OPENACC + } +#endif + + Scalar sum_scalar; + Kokkos::View sum_view("result"); + Kokkos::deep_copy(sum_view, Scalar(1)); + + // Test team policy reduction + { + constexpr int num_teams = get_num_teams(); + TeamSumFunctor tf; + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same::value + ? 32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy(num_teams, team_size); + Kokkos::parallel_reduce(team_pol, tf, sum_view); + Kokkos::deep_copy(sum_scalar, sum_view); + ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams; + } + + // Test TeamThreadRange level reduction with 0 work produces 0 result + { + const int league_size = 1; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, 0, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same::value + ? 
32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy(1, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + ASSERT_EQ(result_h(0), Scalar{0}) << "N: " << N; + } + + // Same test as above, but with inner reduction over N, and league_size=10 + { + const int league_size = 10; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, N, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int initial_team_size = + std::is_same_v ? 32 + : 1; +#else + int initial_team_size = 1; +#endif + auto team_size_max = + Kokkos::TeamPolicy(league_size, initial_team_size) + .team_size_max(tnf, Kokkos::ParallelForTag()); + auto team_size = std::min(team_size_max, TEST_EXECSPACE().concurrency()); + auto team_pol = Kokkos::TeamPolicy(league_size, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + for (int i = 0; i < result_h.extent_int(0); ++i) { + ASSERT_EQ(result_h(i), reference_sum) << "N: " << N; + } + } + } + static void test_sum(int N) { Kokkos::View values("Values", N); auto h_values = Kokkos::create_mirror_view(values); @@ -374,6 +502,8 @@ struct TestReducers { ASSERT_EQ(sum_scalar_view, reference_sum) << "N: " << N; } + test_sum_team_policy(N, f, reference_sum); + { Kokkos::View sum_view("View"); sum_view() = Scalar(1); diff --git a/packages/kokkos/core/unit_test/TestReducers_d.hpp b/packages/kokkos/core/unit_test/TestReducers_d.hpp index 19eaa6d70002..ecf851aa1089 100644 --- a/packages/kokkos/core/unit_test/TestReducers_d.hpp +++ b/packages/kokkos/core/unit_test/TestReducers_d.hpp @@ -80,6 +80,20 @@ TEST(TEST_CATEGORY, reducers_int8_t) { TestReducers::test_prod(4); } +TEST(TEST_CATEGORY, reducers_int16_t) { + using ThisTestType = int16_t; + + TestReducers::test_sum(1); 
+ TestReducers::test_sum(2); + TestReducers::test_sum(3); + TestReducers::test_sum(4); + + TestReducers::test_prod(1); + TestReducers::test_prod(2); + TestReducers::test_prod(3); + TestReducers::test_prod(4); +} + #if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET) // TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to // implicitly-deleted default constructor of 'conv_type' diff --git a/packages/kokkos/core/unit_test/TestSwap.hpp b/packages/kokkos/core/unit_test/TestSwap.hpp new file mode 100644 index 000000000000..4e98351cf19c --- /dev/null +++ b/packages/kokkos/core/unit_test/TestSwap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include +#include + +namespace { + +template +struct TestSwap { + KOKKOS_FUNCTION void operator()(int, int& err) const { + { + int a = 1; + int b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int, int)\n"); + ++err; + } + } + { + float a = 1; + float b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(float, float)\n"); + ++err; + } + } + { + int a[3] = {1, 2, 3}; + int b[3] = {4, 5, 6}; + Kokkos::kokkos_swap(a, b); + if (!(a[0] == 4 && a[1] == 5 && a[2] == 6 && b[0] == 1 && b[1] == 2 && + b[2] == 3)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int[3], int[3])\n"); + ++err; + } + } + } + + TestSwap() { + int errors; + Kokkos::parallel_reduce( + "TestSwap", Kokkos::RangePolicy(0, 1), *this, errors); + EXPECT_EQ(errors, 0); + } +}; + +TEST(TEST_CATEGORY, kokkos_swap) { TestSwap(); } + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestTeamBasic.hpp b/packages/kokkos/core/unit_test/TestTeamBasic.hpp index c395bc0837ce..a3d84c5e16ba 100644 --- a/packages/kokkos/core/unit_test/TestTeamBasic.hpp +++ b/packages/kokkos/core/unit_test/TestTeamBasic.hpp @@ -280,7 +280,7 @@ namespace Test { // Test for non-arithmetic type TEST(TEST_CATEGORY, team_broadcast_long_wrapper) { - static_assert(!std::is_arithmetic::value, ""); + static_assert(!std::is_arithmetic::value); TestTeamBroadcast, long_wrapper>::test_teambroadcast(0, 1); diff --git a/packages/kokkos/core/unit_test/TestTeamMDRange.hpp b/packages/kokkos/core/unit_test/TestTeamMDRange.hpp index 6e65cde0cf88..81931467c5a5 100644 --- a/packages/kokkos/core/unit_test/TestTeamMDRange.hpp +++ b/packages/kokkos/core/unit_test/TestTeamMDRange.hpp @@ -169,7 +169,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); 
Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -202,7 +209,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -236,7 +250,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -272,7 +293,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -310,7 +338,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, 
+#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -350,7 +385,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -420,7 +462,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -457,7 +506,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -496,7 +552,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = 
team.league_rank(); @@ -536,7 +599,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -579,7 +649,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -620,7 +697,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -653,7 +737,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -687,7 +778,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); 
Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -723,7 +821,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -761,7 +866,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -801,7 +913,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -908,13 +1027,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k) = fillFlattenedIndex(i, j, k); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef 
KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -923,7 +1049,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -952,13 +1084,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -966,7 +1105,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& threadSum) { threadSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -997,13 +1142,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = 
fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1013,7 +1165,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1045,13 +1203,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1061,7 +1226,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() 
{ leagueSum += teamSum; }); +#endif }, finalSum); @@ -1100,13 +1271,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1116,7 +1294,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1157,13 +1341,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1174,7 +1365,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) 
leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1207,20 +1404,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1228,11 +1431,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k); }, threadSum); - - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1263,20 +1464,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto 
teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1286,10 +1493,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1321,20 +1527,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1344,10 +1556,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1384,20 +1595,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType 
finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1407,10 +1624,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1451,20 +1667,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5, n6); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1474,10 +1696,9 @@ struct 
TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1510,13 +1731,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1527,7 +1755,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& vectorSum) { vectorSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1558,13 +1792,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType 
teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1577,7 +1818,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1609,13 +1856,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1628,7 +1882,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1665,13 +1925,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { 
- auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1684,7 +1951,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1725,13 +1998,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1745,7 +2025,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1904,13 +2190,6 @@ TEST(TEST_CATEGORY, ThreadVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. 
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestThreadVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_ThreadVectorMDRange(dims); TestThreadVectorMDRangeParallelReduce:: @@ -1944,13 +2223,6 @@ TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestTeamVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_TeamVectorMDRange(dims); TestTeamVectorMDRangeParallelReduce:: diff --git a/packages/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp b/packages/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp index 5b0bfdb1755c..9d89f7570860 100644 --- a/packages/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp +++ b/packages/kokkos/core/unit_test/TestTeamPolicyConstructors.hpp @@ -20,11 +20,24 @@ namespace { +struct SomeTag {}; + +struct FunctorFor { + KOKKOS_FUNCTION + void operator()( + Kokkos::TeamPolicy::member_type const&) const {} + + KOKKOS_FUNCTION + void operator()( + SomeTag, Kokkos::TeamPolicy::member_type const&) const {} +}; + template void test_run_time_parameters() { int league_size = 131; using ExecutionSpace = typename Policy::execution_space; + using ParallelTag = Kokkos::ParallelForTag; int team_size = 4 < ExecutionSpace().concurrency() ? 
4 : ExecutionSpace().concurrency(); #ifdef KOKKOS_ENABLE_HPX @@ -44,6 +57,8 @@ void test_run_time_parameters() { ASSERT_EQ(p1.team_size(), team_size); ASSERT_GT(p1.chunk_size(), 0); ASSERT_EQ(p1.scratch_size(0), 0u); + ASSERT_GT(p1.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p1.team_size_recommended(FunctorFor(), ParallelTag()), 0); Policy p2 = p1.set_chunk_size(chunk_size); ASSERT_EQ(p1.league_size(), league_size); @@ -112,6 +127,8 @@ void test_run_time_parameters() { Policy p8; // default constructed ASSERT_EQ(p8.league_size(), 0); ASSERT_EQ(p8.scratch_size(0), 0u); + ASSERT_GT(p8.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p8.team_size_recommended(FunctorFor(), ParallelTag()), 0); p8 = p3; // call assignment operator ASSERT_EQ(p3.league_size(), league_size); ASSERT_EQ(p3.team_size(), team_size); @@ -121,11 +138,25 @@ void test_run_time_parameters() { ASSERT_EQ(p8.team_size(), team_size); ASSERT_EQ(p8.chunk_size(), chunk_size); ASSERT_EQ(p8.scratch_size(0), size_t(scratch_size)); + + Policy p9(league_size, Kokkos::AUTO); + ASSERT_EQ(p9.league_size(), league_size); + ASSERT_GT(p9.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p9.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p10(league_size, team_size, Kokkos::AUTO); + ASSERT_EQ(p10.league_size(), league_size); + ASSERT_EQ(p10.team_size(), team_size); + ASSERT_GT(p10.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p10.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p11(league_size, Kokkos::AUTO, Kokkos::AUTO); + ASSERT_EQ(p11.league_size(), league_size); + ASSERT_GT(p11.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p11.team_size_recommended(FunctorFor(), ParallelTag()), 0); } TEST(TEST_CATEGORY, team_policy_runtime_parameters) { - struct SomeTag {}; - using TestExecSpace = TEST_EXECSPACE; using DynamicSchedule = Kokkos::Schedule; using LongIndex = Kokkos::IndexType; diff --git 
a/packages/kokkos/core/unit_test/TestTeamVector.hpp b/packages/kokkos/core/unit_test/TestTeamVector.hpp index 39122736ed7e..5e16539d652c 100644 --- a/packages/kokkos/core/unit_test/TestTeamVector.hpp +++ b/packages/kokkos/core/unit_test/TestTeamVector.hpp @@ -1012,7 +1012,6 @@ struct checkScan { }; } // namespace VectorScanReducer -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test(0))); ASSERT_TRUE((TestTeamVector::Test(1))); @@ -1028,9 +1027,7 @@ TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test(11))); ASSERT_TRUE((TestTeamVector::Test(12))); } -#endif -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, triple_nested_parallelism) { // With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run // with a team size of 32 on GPUs, 16 is the max possible (at least on a K80 @@ -1055,7 +1052,6 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) { TestTripleNestedReduce(8192, 2048, 16, 16); TestTripleNestedReduce(8192, 2048, 7, 16); } -#endif TEST(TEST_CATEGORY, parallel_scan_with_reducers) { using T = double; diff --git a/packages/kokkos/core/unit_test/TestUtilities.hpp b/packages/kokkos/core/unit_test/TestUtilities.hpp index b1f9d30c1fc9..ad5a0df92de2 100644 --- a/packages/kokkos/core/unit_test/TestUtilities.hpp +++ b/packages/kokkos/core/unit_test/TestUtilities.hpp @@ -25,20 +25,18 @@ namespace Test { void test_is_specialization_of() { using Kokkos::Impl::is_specialization_of; - static_assert(is_specialization_of, Kokkos::pair>{}, - ""); - static_assert(!is_specialization_of, Kokkos::pair>{}, ""); - static_assert(is_specialization_of, Kokkos::View>{}, ""); + static_assert(is_specialization_of, Kokkos::pair>{}); + static_assert(!is_specialization_of, Kokkos::pair>{}); + static_assert(is_specialization_of, Kokkos::View>{}); // NOTE Not removing cv-qualifiers - static_assert(!is_specialization_of const, Kokkos::View>{}, - ""); + static_assert( + 
!is_specialization_of const, Kokkos::View>{}); // NOTE Would not compile because Kokkos::Array takes a non-type template // parameter - // static_assert(is_specialization_of, Kokkos::Array>{}, - // ""); + // static_assert(is_specialization_of, + // Kokkos::Array>{}); // But this is fine of course - static_assert(!is_specialization_of, Kokkos::pair>{}, - ""); + static_assert(!is_specialization_of, Kokkos::pair>{}); } namespace { diff --git a/packages/kokkos/core/unit_test/TestViewAPI.hpp b/packages/kokkos/core/unit_test/TestViewAPI.hpp index ffc500e4a9ad..ca098dbc2472 100644 --- a/packages/kokkos/core/unit_test/TestViewAPI.hpp +++ b/packages/kokkos/core/unit_test/TestViewAPI.hpp @@ -958,8 +958,7 @@ class TestViewAPI { using mirror_type = typename view_type::HostMirror; static_assert(std::is_same::value, - ""); + typename mirror_type::memory_space>::value); view_type a("a"); mirror_type am = Kokkos::create_mirror_view(a); @@ -1005,25 +1004,25 @@ class TestViewAPI { hView3 hv_3("dView3::HostMirror", N0); hView4 hv_4("dView4::HostMirror", N0); - dView0 dv_0_1(nullptr, 0); + dView0 dv_0_1(nullptr); dView0 dv_0_2(hv_0.label(), hv_0.layout()); - dView1 dv_1_1(nullptr, 0); + dView1 dv_1_1(nullptr, N0); dView1 dv_1_2(hv_1.label(), hv_1.layout()); - dView2 dv_2_1(nullptr, 0); + dView2 dv_2_1(nullptr, N0); dView2 dv_2_2(hv_2.label(), hv_2.layout()); - dView3 dv_3_1(nullptr, 0); + dView3 dv_3_1(nullptr, N0); dView3 dv_3_2(hv_3.label(), hv_3.layout()); - dView4 dv_4_1(nullptr, 0); + dView4 dv_4_1(nullptr, N0); dView4 dv_4_2(hv_4.label(), hv_4.layout()); } static void run_test_contruction_from_layout_2() { using dView3_0 = Kokkos::View; - using dView3_1 = Kokkos::View; + using dView3_1 = Kokkos::View; using dView3_2 = Kokkos::View; using dView3_3 = Kokkos::View; @@ -1554,6 +1553,7 @@ class TestViewAPI { Kokkos::CudaUVMSpace>::value) return; #endif + bool did_throw = false; auto alloc_size = std::numeric_limits::max() - 42; try { auto should_always_fail = 
dView1("hello_world_failure", alloc_size); @@ -1585,7 +1585,9 @@ class TestViewAPI { "because of an unknown error.", msg); } #endif + did_throw = true; } + ASSERT_TRUE(did_throw); } }; diff --git a/packages/kokkos/core/unit_test/TestViewAPI_d.hpp b/packages/kokkos/core/unit_test/TestViewAPI_d.hpp index 08d21f54499f..b0d759ffccc6 100644 --- a/packages/kokkos/core/unit_test/TestViewAPI_d.hpp +++ b/packages/kokkos/core/unit_test/TestViewAPI_d.hpp @@ -27,8 +27,19 @@ TEST(TEST_CATEGORY, view_api_d) { } TEST(TEST_CATEGORY, view_allocation_error) { +#if defined(__has_feature) +#if __has_feature(address_sanitizer) + GTEST_SKIP() << "AddressSanitzer detects allocating too much memory " + "preventing our checks to run"; +#endif +#endif #if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3)) GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memory"; +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same_v) { + GTEST_SKIP() << "acc_malloc() not properly returning nullptr"; + } #endif TestViewAPI::run_test_error(); } diff --git a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp index 3bfc93aadacf..a4735b299887 100644 --- a/packages/kokkos/core/unit_test/TestViewCopy_a.hpp +++ b/packages/kokkos/core/unit_test/TestViewCopy_a.hpp @@ -147,6 +147,40 @@ TEST(TEST_CATEGORY, view_copy_tests) { Kokkos::deep_copy(s_a, hs_a); ASSERT_TRUE(run_check(s_a, 6)); } + } else { + // These copies won't succeed, but they should each throw + // an exception whose message contains the view labels, + // and the names of the views' memory spaces. + // + // Note: original a,b both have the same device type, + // and their mirrors have the same device type. 
+ using memory_space = typename decltype(a)::memory_space; + using mirror_memory_space = typename decltype(h_a)::memory_space; + bool threw = false; + std::string msg; + try { + Kokkos::deep_copy(hs_b, s_b); + } catch (std::exception& e) { + threw = true; + msg = e.what(); + } + ASSERT_TRUE(threw); + ASSERT_NE(msg.find(hs_b.label()), std::string::npos); + ASSERT_NE(msg.find(s_b.label()), std::string::npos); + ASSERT_NE(msg.find(memory_space().name()), std::string::npos); + ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos); + threw = false; + try { + Kokkos::deep_copy(s_a, hs_a); + } catch (std::exception& e) { + threw = true; + msg = e.what(); + } + ASSERT_TRUE(threw); + ASSERT_NE(msg.find(s_a.label()), std::string::npos); + ASSERT_NE(msg.find(hs_a.label()), std::string::npos); + ASSERT_NE(msg.find(memory_space().name()), std::string::npos); + ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos); } // Contiguous copies diff --git a/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp b/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp index d71841eef847..40b7737f2e45 100644 --- a/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp +++ b/packages/kokkos/core/unit_test/TestViewCtorDimMatch.hpp @@ -19,33 +19,72 @@ namespace Test { -#define LIVE(EXPR, ARGS, DYNRANK) EXPECT_NO_THROW(EXPR) -#define DIE(EXPR, ARGS, DYNRANK) \ - ASSERT_DEATH( \ - EXPR, \ - "Constructor for Kokkos View 'v_" #ARGS \ - "' has mismatched number of arguments. 
Number of arguments = " #ARGS \ - " but dynamic rank = " #DYNRANK) +template +void test_matching_arguments_rank_helper(std::index_sequence) { + constexpr int nargs = sizeof...(Is); + using view_type = Kokkos::View; + if (nargs == rank || nargs == dynrank) { + EXPECT_NO_THROW({ view_type v("v", ((Is * 0) + 1)...); }); + EXPECT_NO_THROW({ view_type v(nullptr, ((Is * 0) + 1)...); }); + } else { + ASSERT_DEATH( + { view_type v("v", ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'v' has mismatched number of arguments. " + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + ASSERT_DEATH( + { view_type v(nullptr, ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'UNMANAGED' has mismatched number of " + "arguments. " + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + } +} -#define PARAM_0 -#define PARAM_1 1 -#define PARAM_2 1, 1 -#define PARAM_3 1, 1, 1 -#define PARAM_4 1, 1, 1, 1 -#define PARAM_5 1, 1, 1, 1, 1 -#define PARAM_6 1, 1, 1, 1, 1, 1 -#define PARAM_7 1, 1, 1, 1, 1, 1, 1 +template class RankType> +void test_matching_arguments_rank() { + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<0>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<1>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<2>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<3>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<4>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<5>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<6>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<7>()); + 
test_matching_arguments_rank_helper::type>( + std::make_index_sequence<8>()); +} -#define PARAM_0_RANK 0 -#define PARAM_1_RANK 1 -#define PARAM_2_RANK 2 -#define PARAM_3_RANK 3 -#define PARAM_4_RANK 4 -#define PARAM_5_RANK 5 -#define PARAM_6_RANK 6 -#define PARAM_7_RANK 7 +template +struct DynamicRank { + using type = typename DynamicRank::type*; +}; -using DType = int; +template <> +struct DynamicRank<0> { + using type = int; +}; // Skip test execution when KOKKOS_ENABLE_OPENMPTARGET is enabled until // Kokkos::abort() aborts properly on that backend @@ -53,348 +92,110 @@ using DType = int; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_dyn) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType *; - using DType_2 = DType **; - using DType_3 = DType ***; - using DType_4 = DType ****; - using DType_5 = DType *****; - using DType_6 = DType ******; - using DType_7 = DType *******; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 2, dynamic = 2 - DIE({ 
Kokkos::View v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 2); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } - - { - // test View parameters for View dim = 3, dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } - - { - // test View parameters for View dim = 4, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } - - { - // test View parameters for View dim = 5, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, DynamicRank>(); // dim = 0, dynamic = 0 + 
test_matching_arguments_rank<1, 1, DynamicRank>(); // dim = 1, dynamic = 1 + test_matching_arguments_rank<2, 2, DynamicRank>(); // dim = 2, dynamic = 2 + test_matching_arguments_rank<3, 3, DynamicRank>(); // dim = 3, dynamic = 3 + test_matching_arguments_rank<4, 4, DynamicRank>(); // dim = 4, dynamic = 4 + test_matching_arguments_rank<5, 5, DynamicRank>(); // dim = 5, dynamic = 5 + test_matching_arguments_rank<6, 6, DynamicRank>(); // dim = 6, dynamic = 6 + test_matching_arguments_rank<7, 7, DynamicRank>(); // dim = 7, dynamic = 7 + test_matching_arguments_rank<8, 8, DynamicRank>(); // dim = 8, dynamic = 8 +#endif +} - { - // test View parameters for View dim = 6, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } +template +struct StaticRank { + using type = typename StaticRank::type[1]; +}; - { - // test View parameters for View dim = 7, dynamic = 7 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 7); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 7); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 7); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 7); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 7); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 7); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 7); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 7); - } -} +template <> +struct StaticRank<0> { + using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_stat) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType[1][1]; - using DType_3 = DType[1][1][1]; - using DType_4 = 
DType[1][1][1][1]; - using DType_5 = DType[1][1][1][1][1]; - using DType_6 = DType[1][1][1][1][1][1]; - using DType_7 = DType[1][1][1][1][1][1][1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 3, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View 
v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 4, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, StaticRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, StaticRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 0, StaticRank>(); // dim = 2, dynamic = 0 + test_matching_arguments_rank<3, 0, StaticRank>(); // dim = 3, dynamic = 0 + test_matching_arguments_rank<4, 0, StaticRank>(); // dim = 4, dynamic = 0 + test_matching_arguments_rank<5, 0, StaticRank>(); // dim = 5, dynamic = 0 + test_matching_arguments_rank<6, 0, StaticRank>(); // dim = 6, dynamic = 0 + test_matching_arguments_rank<7, 0, StaticRank>(); // dim = 7, dynamic = 0 + test_matching_arguments_rank<8, 0, StaticRank>(); // dim = 8, dynamic = 0 +#endif +} - { - // test View parameters for View dim = 5, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } +template +struct MixedRank { + using type = typename DynamicRank::type[1]; +}; - { - // test View parameters for View dim = 6, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ 
Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 7, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } -} +template <> +struct MixedRank<0> { + using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_mix) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType * [1]; - using DType_3 = DType * * [1]; - using DType_4 = DType ** * [1]; - using DType_5 = DType *** * [1]; - using DType_6 = DType **** * [1]; - using DType_7 = DType ***** * [1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ 
Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 3, dynamic = 2 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 2); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, MixedRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, MixedRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 1, MixedRank>(); // dim = 2, dynamic = 1 + test_matching_arguments_rank<3, 2, MixedRank>(); // dim = 3, dynamic = 2 + test_matching_arguments_rank<4, 3, MixedRank>(); // dim = 4, dynamic = 3 + test_matching_arguments_rank<5, 4, MixedRank>(); // dim = 5, dynamic = 4 + test_matching_arguments_rank<6, 5, MixedRank>(); // dim = 6, dynamic = 5 + test_matching_arguments_rank<7, 6, MixedRank>(); // dim = 7, dynamic = 6 + test_matching_arguments_rank<8, 7, MixedRank>(); // dim = 8, dynamic = 7 +#endif +} - { - // test View parameters for View dim = 4, 
dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } +#define CHECK_DEATH(EXPR) \ + ASSERT_DEATH(EXPR, \ + "The specified run-time extent for Kokkos::View 'v' does not " \ + "match the compile-time extent in dimension 0. The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 5, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } +#define CHECK_DEATH_UNMANAGED(EXPR) \ + ASSERT_DEATH( \ + EXPR, \ + "The specified run-time extent for Kokkos::View 'UNMANAGED' does not " \ + "match the compile-time extent in dimension 0. 
The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 6, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } +TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_static_extents) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - { - // test View parameters for View dim = 7, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + // clang-format off + CHECK_DEATH({ Kokkos::View v("v", 2); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1, 1); }); + + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ 
Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1, 1); }); + // clang-format on +#endif } -#endif // KOKKOS_ENABLE_OPENMPTARGET - -#undef PARAM_0 -#undef PARAM_1 -#undef PARAM_2 -#undef PARAM_3 -#undef PARAM_4 -#undef PARAM_5 -#undef PARAM_6 -#undef PARAM_7 -#undef PARAM_0_RANK -#undef PARAM_1_RANK -#undef PARAM_2_RANK -#undef PARAM_3_RANK -#undef PARAM_4_RANK -#undef PARAM_5_RANK -#undef PARAM_6_RANK -#undef PARAM_7_RANK - -#undef DType - -#undef LIVE -#undef DIE +#undef CHECK_DEATH +#endif // KOKKOS_ENABLE_OPENMPTARGET } // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp b/packages/kokkos/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp new file mode 100644 index 000000000000..b156b72860ec --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp @@ -0,0 +1,55 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +template +void test_empty_view_runtime_unmanaged() { + T d{}; + auto* p = reinterpret_cast(0xABADBABE); + + (void)Kokkos::View(p); + (void)Kokkos::View(&d); + (void)Kokkos::View(nullptr); + (void)Kokkos::View(NULL); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0); + (void)Kokkos::View(&d, 0); + (void)Kokkos::View(nullptr, 0); + (void)Kokkos::View(NULL, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0, 0); + (void)Kokkos::View(&d, 0, 0); + (void)Kokkos::View(nullptr, 0, 0); + (void)Kokkos::View(NULL, 0, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0, 0); // NOLINT(modernize-use-nullptr) +} + +TEST(TEST_CATEGORY, view_empty_runtime_unmanaged) { + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); +} + +} // namespace diff --git a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp index 9173f0d4316e..a4dfdb26e3f4 100644 --- a/packages/kokkos/core/unit_test/TestViewMapping_a.hpp +++ b/packages/kokkos/core/unit_test/TestViewMapping_a.hpp @@ -73,67 +73,67 @@ void test_view_mapping() { ASSERT_LE(sizeof(dim_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); ASSERT_EQ(sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); #endif - static_assert(int(dim_0::rank) == int(0), ""); - static_assert(int(dim_0::rank_dynamic) == int(0), ""); - static_assert(int(dim_0::ArgN0) == 1, ""); - static_assert(int(dim_0::ArgN1) == 1, ""); - static_assert(int(dim_0::ArgN2) == 1, ""); - - static_assert(int(dim_s2::rank) == int(1), ""); - static_assert(int(dim_s2::rank_dynamic) == int(0), ""); - 
static_assert(int(dim_s2::ArgN0) == 2, ""); - static_assert(int(dim_s2::ArgN1) == 1, ""); - - static_assert(int(dim_s2_s3::rank) == int(2), ""); - static_assert(int(dim_s2_s3::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3::ArgN2) == 1, ""); - - static_assert(int(dim_s2_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3_s4::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3_s4::ArgN2) == 4, ""); - static_assert(int(dim_s2_s3_s4::ArgN3) == 1, ""); - - static_assert(int(dim_s0::rank) == int(1), ""); - static_assert(int(dim_s0::rank_dynamic) == int(1), ""); - - static_assert(int(dim_s0_s3::rank) == int(2), ""); - static_assert(int(dim_s0_s3::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3::ArgN1) == 3, ""); - - static_assert(int(dim_s0_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s0_s3_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2), ""); - static_assert(int(dim_s0_s0_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN1) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s0::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3), ""); - - static_assert(int(dim_s0_s0_s0_s0::rank) == int(4), ""); - static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5), ""); - static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) 
== int(6), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8), ""); + static_assert(int(dim_0::rank) == int(0)); + static_assert(int(dim_0::rank_dynamic) == int(0)); + static_assert(int(dim_0::ArgN0) == 1); + static_assert(int(dim_0::ArgN1) == 1); + static_assert(int(dim_0::ArgN2) == 1); + + static_assert(int(dim_s2::rank) == int(1)); + static_assert(int(dim_s2::rank_dynamic) == int(0)); + static_assert(int(dim_s2::ArgN0) == 2); + static_assert(int(dim_s2::ArgN1) == 1); + + static_assert(int(dim_s2_s3::rank) == int(2)); + static_assert(int(dim_s2_s3::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3::ArgN0) == 2); + static_assert(int(dim_s2_s3::ArgN1) == 3); + static_assert(int(dim_s2_s3::ArgN2) == 1); + + static_assert(int(dim_s2_s3_s4::rank) == int(3)); + static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3_s4::ArgN0) == 2); + static_assert(int(dim_s2_s3_s4::ArgN1) == 3); + static_assert(int(dim_s2_s3_s4::ArgN2) == 4); + static_assert(int(dim_s2_s3_s4::ArgN3) == 1); + + static_assert(int(dim_s0::rank) == int(1)); + static_assert(int(dim_s0::rank_dynamic) == int(1)); + + static_assert(int(dim_s0_s3::rank) == int(2)); + static_assert(int(dim_s0_s3::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3::ArgN0) == 0); + static_assert(int(dim_s0_s3::ArgN1) == 3); + + static_assert(int(dim_s0_s3_s4::rank) == int(3)); + static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3_s4::ArgN0) == 0); + static_assert(int(dim_s0_s3_s4::ArgN1) == 3); + static_assert(int(dim_s0_s3_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s4::rank) == int(3)); + 
static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2)); + static_assert(int(dim_s0_s0_s4::ArgN0) == 0); + static_assert(int(dim_s0_s0_s4::ArgN1) == 0); + static_assert(int(dim_s0_s0_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s0::rank) == int(3)); + static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3)); + + static_assert(int(dim_s0_s0_s0_s0::rank) == int(4)); + static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4)); + + static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5)); + static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8)); dim_s0 d1(2, 3, 4, 5, 6, 7, 8, 9); dim_s0_s0 d2(2, 3, 4, 5, 6, 7, 8, 9); @@ -514,11 +514,11 @@ void test_view_mapping() { { using namespace Kokkos::Impl; - static_assert(rank_dynamic<>::value == 0, ""); - static_assert(rank_dynamic<1>::value == 0, ""); - static_assert(rank_dynamic<0>::value == 1, ""); - static_assert(rank_dynamic<0, 1>::value == 1, ""); - static_assert(rank_dynamic<0, 0, 1>::value == 2, ""); + static_assert(rank_dynamic<>::value == 0); + static_assert(rank_dynamic<1>::value == 0); + static_assert(rank_dynamic<0>::value == 1); + static_assert(rank_dynamic<0, 1>::value == 1); + static_assert(rank_dynamic<0, 0, 1>::value == 2); } { @@ -529,54 +529,48 @@ void test_view_mapping() { using a_const_int_r1 = ViewArrayAnalysis; using a_const_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r1::dimension::rank == 1, ""); - static_assert(a_int_r1::dimension::rank_dynamic == 1, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - 
static_assert(a_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_int_r1::dimension::rank == 1); + static_assert(a_int_r1::dimension::rank_dynamic == 1); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 4); + static_assert(a_int_r5::dimension::ArgN3 == 5); + static_assert(a_int_r5::dimension::ArgN4 == 6); + static_assert(a_int_r5::dimension::ArgN5 == 1); static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); - static_assert(a_const_int_r1::dimension::rank == 1, ""); - static_assert(a_const_int_r1::dimension::rank_dynamic == 1, ""); + static_assert(a_const_int_r1::dimension::rank == 1); + static_assert(a_const_int_r1::dimension::rank_dynamic == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0> >::value); + static_assert(std::is_same::value); - static_assert(a_const_int_r5::dimension::rank == 5, ""); - static_assert(a_const_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_const_int_r5::dimension::rank == 5); + static_assert(a_const_int_r5::dimension::rank_dynamic == 2); - static_assert(a_const_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_const_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_const_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_const_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_const_int_r5::dimension::ArgN0 == 0); + static_assert(a_const_int_r5::dimension::ArgN1 == 0); + static_assert(a_const_int_r5::dimension::ArgN2 == 4); + static_assert(a_const_int_r5::dimension::ArgN3 == 5); + 
static_assert(a_const_int_r5::dimension::ArgN4 == 6); + static_assert(a_const_int_r5::dimension::ArgN5 == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); + static_assert(std::is_same::value); - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 2); static_assert(std::is_same >::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -587,15 +581,15 @@ void test_view_mapping() { // Dimensions of t_i4 are appended to the multdimensional array. using a_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 3, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 0, ""); - static_assert(a_int_r5::dimension::ArgN3 == 3, ""); - static_assert(a_int_r5::dimension::ArgN4 == 4, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 3); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 0); + static_assert(a_int_r5::dimension::ArgN3 == 3); + static_assert(a_int_r5::dimension::ArgN4 == 4); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -603,71 +597,54 @@ void test_view_mapping() { using a_const_int_r1 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + 
std::is_same::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, ""); - static_assert( - std::is_same::value, - ""); + std::is_same::value); + static_assert(std::is_same::value); using a_const_int_r3 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0, 0, 4> >::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); - static_assert( - std::is_same::value, - ""); + int* * [4]>::value); + static_assert(std::is_same::value); static_assert( std::is_same::value, - ""); + int* * [4]>::value); // std::cout << "typeid( const int**[4] ).name() = " << typeid( const // int**[4] ).name() << std::endl; diff --git a/packages/kokkos/core/unit_test/TestViewMapping_b.hpp b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp index 9ac4e7da8453..4aee035d17a6 100644 --- a/packages/kokkos/core/unit_test/TestViewMapping_b.hpp +++ b/packages/kokkos/core/unit_test/TestViewMapping_b.hpp @@ -156,7 +156,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -167,7 +167,7 
@@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -180,7 +180,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -193,7 +193,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -206,7 +206,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } { // Assignment of rank-2 Right = Left @@ -215,7 +215,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } } @@ -226,7 +226,7 @@ TEST(TEST_CATEGORY, view_mapping_trivially_copyable) { using src_traits = dst_traits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(std::is_trivially_copyable{}, ""); + static_assert(std::is_trivially_copyable{}); } } // namespace Test diff --git a/packages/kokkos/core/unit_test/TestViewOutOfBoundsAccess.hpp b/packages/kokkos/core/unit_test/TestViewOutOfBoundsAccess.hpp new file mode 100644 index 000000000000..2716856c1fcf --- /dev/null +++ b/packages/kokkos/core/unit_test/TestViewOutOfBoundsAccess.hpp @@ -0,0 +1,175 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include + +namespace { + +TEST(TEST_CATEGORY, append_formatted_multidimensional_index) { + using Kokkos::Impl::append_formatted_multidimensional_index; + { + char buffer[64] = "my prefix "; + append_formatted_multidimensional_index(buffer, 1); + EXPECT_STREQ(buffer, "my prefix [1]"); + } + { + char buffer[64] = "I was here"; + append_formatted_multidimensional_index(buffer, 1, 2, 3); + EXPECT_STREQ(buffer, "I was here[1,2,3]"); + } + { + char buffer[64] = "with mixed integer types "; + append_formatted_multidimensional_index(buffer, 1u, -2); + EXPECT_STREQ(buffer, "with mixed integer types [1,-2]"); + } +} + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + +template +struct TestViewOutOfBoundAccess { + View v; + static constexpr auto rank = View::rank; + + template + KOKKOS_FUNCTION decltype(auto) bad_access(std::index_sequence) const { + return v((Is * 1 + Is == 0 ? v.extent(Is) + 3 : 0)...); + } + + KOKKOS_FUNCTION void operator()(int) const { + ++bad_access(std::make_index_sequence{}); + } + + template + std::string get_details(std::index_sequence) { + std::stringstream ss; + ss << "with indices \\["; + ((ss << (Is == 0 ? v.extent(Is) + 3 : 0) + << (Is == View::rank() - 1 ? "\\]" : ",")), + ...); + ss << " but extents \\["; + ((ss << v.extent(Is) << (Is == View::rank() - 1 ? 
"\\]" : ",")), ...); + return ss.str(); + } + + auto get_details() { + return get_details(std::make_index_sequence()); + } + + TestViewOutOfBoundAccess(View w, ExecutionSpace const& s, std::string matcher) + : v(std::move(w)) { + constexpr bool view_accessible_from_execution_space = + Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/typename View::memory_space>::accessible; + EXPECT_TRUE(view_accessible_from_execution_space); + + matcher += ".*" + get_details(); + + EXPECT_DEATH( + { + Kokkos::parallel_for(Kokkos::RangePolicy(s, 0, 1), + *this); + Kokkos::fence(); + }, + matcher); + } +}; + +template +auto make_view_impl(LblOrPtr x, std::index_sequence) { + return View(x, (Is + 1)...); +} + +template +auto make_view(LblOrPtr x) { + return make_view_impl(std::move(x), + std::make_index_sequence()); +} + +template +void test_view_out_of_bounds_access() { + ExecutionSpace const exec_space{}; + // clang-format off + using V1 = Kokkos::View; + using V2 = Kokkos::View; + using V3 = Kokkos::View; + using V4 = Kokkos::View; + using V5 = Kokkos::View; + using V6 = Kokkos::View; + using V7 = Kokkos::View; + using V8 = Kokkos::View; + std::string const prefix = "Kokkos::View ERROR: out of bounds access"; + std::string const lbl = "my_label"; + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + int* const ptr = nullptr; + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + 
".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + // clang-format on +} + +TEST(TEST_CATEGORY_DEATH, view_out_of_bounds_access) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (false && Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/Kokkos::HostSpace>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + +#if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG) // FIXME_SYCL + if (std::is_same_v) { + GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG " + "is defined"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET + if (std::is_same_v) { + GTEST_SKIP() << "skipping because OpenMPTarget backend is currently not " + "able to abort from the device"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same::value) { + GTEST_SKIP() << "skipping because OpenACC backend is currently not " + "able to abort from the device"; + } +#endif + + test_view_out_of_bounds_access(); +} + +#endif + +} // namespace diff --git a/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp b/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp index b522ac3e69b7..25442146fbad 100644 --- a/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/packages/kokkos/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -19,22 +19,23 @@ #include #include 
-int get_device_count() { +int get_num_devices() { + int num_devices; #if defined(KOKKOS_ENABLE_CUDA) - int count; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); - return count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_HIP) - int count; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&count)); - return count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_num_devices(); + num_devices = omp_get_num_devices(); #elif defined(KOKKOS_ENABLE_OPENACC) - return acc_get_num_devices(acc_get_device_type()); + num_devices = acc_get_num_devices(acc_get_device_type()); +#elif defined(KOKKOS_ENABLE_SYCL) + num_devices = sycl::device::get_devices(sycl::info::device_type::gpu).size(); #else - return 0; + num_devices = -1; #endif + assert(num_devices == Kokkos::num_devices()); + return num_devices; } int get_device_id() { @@ -44,15 +45,17 @@ int get_device_id() { #elif defined(KOKKOS_ENABLE_HIP) KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&device_id)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - device_id = omp_get_device_num(); + device_id = omp_get_default_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - device_id = acc_get_device_num(acc_get_device_type()); + device_id = acc_get_device_num(acc_get_device_type()); #elif defined(KOKKOS_ENABLE_SYCL) - // FIXME_SYCL ? - assert(false); - return -2; + // Not able to query the underlying runtime because there is no such thing as + // device currently being used with SYCL. We go through the Kokkos runtime + // which makes the assert below pointless but it still let us check that + // Kokkos selected the device we asked for from the Python tests. 
+ device_id = Kokkos::device_id(); #else - device_id = -1; + device_id = -1; #endif assert(device_id == Kokkos::device_id()); return device_id; @@ -68,6 +71,14 @@ int get_max_threads() { #endif } +int get_hwloc_enabled() { +#ifdef KOKKOS_ENABLE_HWLOC + return 1; +#else + return 0; +#endif +} + int get_num_threads() { int const num_threads = Kokkos::DefaultHostExecutionSpace().concurrency(); assert(num_threads == Kokkos::num_threads()); @@ -90,9 +101,10 @@ int print_flag(std::string const& flag) { KOKKOS_TEST_PRINT_FLAG(num_threads); KOKKOS_TEST_PRINT_FLAG(max_threads); KOKKOS_TEST_PRINT_FLAG(device_id); - KOKKOS_TEST_PRINT_FLAG(device_count); + KOKKOS_TEST_PRINT_FLAG(num_devices); KOKKOS_TEST_PRINT_FLAG(disable_warnings); KOKKOS_TEST_PRINT_FLAG(tune_internals); + KOKKOS_TEST_PRINT_FLAG(hwloc_enabled); #undef KOKKOS_TEST_PRINT_FLAG diff --git a/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash index 8fe8e2b5ecea..8bc8ef21cd02 100755 --- a/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash +++ b/packages/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL) DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) if [ ! 
-z "$KOKKOS_HOST_ARCH_TEST" ]; then export KOKKOS_ARCH_TEST=1 - HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) + HostArch=(SNB HSW SKX AMDAVX ARMv80 ARMv81 BDW KNC KNL Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) DeviceArch=() fi diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 000000000000..d94735ceb230 --- /dev/null +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,268 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +struct StreamsAndDevices { + std::array streams; + std::array devices; + + StreamsAndDevices() { + int n_devices; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices)); + + devices = {0, n_devices - 1}; + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&streams[i])); + } + } + StreamsAndDevices(const StreamsAndDevices &) = delete; + StreamsAndDevices &operator=(const StreamsAndDevices &) = delete; + ~StreamsAndDevices() { + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(streams[i])); + } + } +}; + +std::array get_execution_spaces( + const StreamsAndDevices &streams_and_devices) { + TEST_EXECSPACE exec0(streams_and_devices.streams[0]); + TEST_EXECSPACE exec1(streams_and_devices.streams[1]); + + // Must return void to use ASSERT_EQ + [&]() { + ASSERT_EQ(exec0.cuda_device(), streams_and_devices.devices[0]); + ASSERT_EQ(exec1.cuda_device(), streams_and_devices.devices[1]); + }(); + + return {exec0, exec1}; +} + +// Test Interoperability with Cuda Streams +void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, + TEST_EXECSPACE exec, Kokkos::View v) { + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + Kokkos::deep_copy(exec, v, 5); + Kokkos::deep_copy(exec0, v0, 5); + + Kokkos::deep_copy(v, v0); + + int sum; + int sum0; + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0", + Kokkos::RangePolicy(exec0, 0, 100), + Test::FunctorRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range", + Kokkos::RangePolicy(exec, 0, 100), + Test::FunctorRange(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce_0", + Kokkos::RangePolicy>(exec0, + 0, 100), + Test::FunctorRangeReduce(v0), sum0); + Kokkos::parallel_reduce( + 
"Test::cuda::raw_cuda_stream::RangeReduce", + Kokkos::RangePolicy>(exec, 0, + 100), + Test::FunctorRangeReduce(v), sum); + ASSERT_EQ(600, sum0); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0", + Kokkos::MDRangePolicy>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange", + Kokkos::MDRangePolicy>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRange(v)); + Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v0), sum0); + Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v), sum); + ASSERT_EQ(700, sum0); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy(exec0, 10, 10), + Test::FunctorTeam(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy(exec, 10, 10), + Test::FunctorTeam(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy>(exec0, + 10, 10), + Test::FunctorTeamReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy>(exec, 10, + 10), + Test::FunctorTeamReduce(v), sum); + ASSERT_EQ(800, sum0); + ASSERT_EQ(800, sum); +} + +TEST(cuda_multi_gpu, managed_views) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + Kokkos::View view0( + Kokkos::view_alloc("v0", execs[0]), 100); + Kokkos::View view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); + } +} + +TEST(cuda_multi_gpu, unmanaged_views) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + 
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[0].cuda_device())); + int *p0; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p0), sizeof(int) * 100)); + Kokkos::View view0(p0, 100); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[1].cuda_device())); + int *p; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p), sizeof(int) * 100)); + Kokkos::View view(p, 100); + + test_policies(execs[0], view0, execs[1], view); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p)); + } +} + +struct ScratchFunctor { + int scratch_size; + int R; + + ScratchFunctor(int scratch_size_, int R_) + : scratch_size(scratch_size_), R(R_) {} + + KOKKOS_FUNCTION + void operator()(const Kokkos::TeamPolicy::member_type &team, + int &error_accum) const { + Kokkos::View scratch_mem( + team.team_scratch(1), scratch_size); + + // Initialize scratch memory + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) = 0; }); + team.team_barrier(); + + // Increment each entry in scratch memory R times + for (int r = 0; r < R; ++r) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) += 1; }); + } + team.team_barrier(); + + // Check that each scratch entry has been incremented exactly R times + int team_error_accum; + auto R_loc = R; // avoid implicit capture of this + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i, int &tsum) { + if (scratch_mem(i) != R_loc) { + tsum += 1; + } + }, + team_error_accum); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { error_accum += team_error_accum; }); + } +}; + +void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { + constexpr int N = 10; + constexpr int R = 1000; + constexpr int scratch_size = 100; + using ScratchType = Kokkos::View; + + // Test allocating and using scratch space + ScratchFunctor f(scratch_size, R); + + auto policy0 = + Kokkos::TeamPolicy(exec0, N, 
10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + auto policy1 = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + + int error0, error1; + + Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); + Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); + + // Request larger scratch size to trigger a realloc and test + const auto new_scratch_size = scratch_size + 10; + ScratchFunctor f_more_scratch(new_scratch_size, R); + + auto policy0_more_scratch = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + auto policy1_more_scratch = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + + Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, + f_more_scratch, error0); + Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, + f_more_scratch, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); +} + +TEST(cuda_multi_gpu, scratch_space) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + test_scratch(execs[0], execs[1]); + } +} +} // namespace diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp index ae603101abb3..11fe6b8555b8 100644 --- a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -29,200 +29,166 @@ __global__ void test_cuda_spaces_int_value(int *ptr) { TEST(cuda, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + 
Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert( - !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + 
Kokkos::HostSpace>::accessible); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaSpace>::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + 
Kokkos::HostSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( std::is_same::Space, - Kokkos::CudaHostPinnedSpace>::value, - ""); + Kokkos::CudaHostPinnedSpace>::value); static_assert(std::is_same, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); - static_assert( - Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + static_assert(Kokkos::SpaceAccessibility< + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); #ifdef KOKKOS_ENABLE_CUDA_UVM using uvm_view = Kokkos::View; static_assert(std::is_same::Space; static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); } } // namespace Test diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp deleted file mode 100644 index 1b6a140920c8..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 
4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp deleted file mode 100644 index 316bc85526f4..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp deleted file mode 100644 index 6344960a1cfe..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp deleted file mode 100644 index 4515174b82b1..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp deleted file mode 100644 index 7ead50f0944e..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp deleted file mode 100644 index e12b9b3894ae..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp deleted file mode 100644 index 959d0ab7503e..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp deleted file mode 100644 index 07d841519dcf..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp deleted file mode 100644 index 042a515b16ac..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp deleted file mode 100644 index dba401e5bcf9..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp deleted file mode 100644 index a44c58bdb55a..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp deleted file mode 100644 index cac0841dd832..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp deleted file mode 100644 index bafe3b3fd2af..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp deleted file mode 100644 index 3a4dd9d2533d..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp deleted file mode 100644 index 4e92aae565a3..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -#include diff --git a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp b/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp deleted file mode 100644 index 44b8f3428d92..000000000000 --- a/packages/kokkos/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -#include diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp index 8c72e9f29724..a213453ea182 100644 --- a/packages/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp +++ b/packages/kokkos/core/unit_test/hip/TestHIP_Memory_Requirements.cpp @@ -48,6 +48,9 @@ TEST(hip, memory_requirements) { // we want all user-facing memory in hip to be coarse grained. 
As of // today(07.01.22) the documentation is not reliable/correct, we test the // memory on the device and host + // FIXME_HIP + GTEST_SKIP() << "skipping the test because the CI on MI100 returns: error( " + "hipErrorInvalidValue)"; KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPHostPinnedSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPManagedSpace, int, 10); diff --git a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp index 14fd4e28837c..8f7499c244b0 100644 --- a/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp +++ b/packages/kokkos/core/unit_test/hip/TestHIP_Spaces.cpp @@ -29,198 +29,164 @@ __global__ void test_hip_spaces_int_value(int *ptr) { TEST(hip, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - 
static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); 
//-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, - Kokkos::HIPHostPinnedSpace>::value, - ""); + Kokkos::HIPHostPinnedSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::HIPManagedSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); } template diff --git a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp index 
25c7138ed3c1..d7b2a57b4421 100644 --- a/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp +++ b/packages/kokkos/core/unit_test/incremental/Test01_execspace.hpp @@ -62,8 +62,10 @@ struct TestIncrExecSpace { auto concurrency = ExecSpace().concurrency(); ASSERT_GT(concurrency, 0); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int in_parallel = ExecSpace::in_parallel(); ASSERT_FALSE(in_parallel); +#endif const char* name = ExecSpace::name(); std::cout << name << std::endl; diff --git a/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp deleted file mode 100644 index 92b8032bf0c4..000000000000 --- a/packages/kokkos/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp +++ /dev/null @@ -1,105 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include - -#include - -namespace Test { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -TEST(openmp, partition_master) { - using Mutex = Kokkos::Experimental::MasterLock; - - Mutex mtx; - int errors = 0; - - auto master = [&errors, &mtx](int /*partition_id*/, int /*num_partitions*/) { - const int pool_size = Kokkos::OpenMP().impl_thread_pool_size(); - - { - std::unique_lock lock(mtx); - if (Kokkos::OpenMP::in_parallel()) { - ++errors; - } - if (Kokkos::OpenMP::impl_thread_pool_rank() != 0) { - ++errors; - } - } - - { - int local_errors = 0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, 1000), - [pool_size](const int, int& errs) { - if (Kokkos::OpenMP().impl_thread_pool_size() != pool_size) { - ++errs; - } - }, - local_errors); - Kokkos::atomic_add(&errors, local_errors); - } - - Kokkos::Experimental::UniqueToken token; - - Kokkos::View count("", token.size()); - - Kokkos::parallel_for(Kokkos::RangePolicy(0, 1000), - [=](const int) { - int i = token.acquire(); - ++count[i]; - token.release(i); - }); - - Kokkos::View sum(""); - Kokkos::parallel_for( - Kokkos::RangePolicy(0, token.size()), - [=](const int i) { Kokkos::atomic_add(sum.data(), count[i]); }); - - if (sum() != 1000) { - Kokkos::atomic_add(&errors, 1); - } - }; - - master(0, 1); - - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 4, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 4); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 2, 2); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 8); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 8); - ASSERT_EQ(errors, 0); -} -#endif - -} // namespace Test diff --git 
a/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp b/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp index 914f8432488d..a4fd053e83d7 100644 --- a/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp +++ b/packages/kokkos/core/unit_test/sycl/TestSYCL_Spaces.cpp @@ -21,235 +21,192 @@ namespace Test { TEST(sycl, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + 
Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + 
Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - 
Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same< Kokkos::Impl::HostMirror< Kokkos::Experimental::SYCLSharedUSMSpace>::Space, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::Experimental::SYCLHostUSMSpace>::value, - ""); + Kokkos::Experimental::SYCLHostUSMSpace>::value); static_assert( 
std::is_same< Kokkos::Device, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLSharedUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLSharedUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLHostUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLHostUSMSpace>::Space, + Kokkos::HostSpace>::accessible); } TEST(sycl, uvm) { diff --git a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp index 3c85f661aaea..946169a786db 100644 --- a/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp +++ b/packages/kokkos/core/unit_test/tools/TestEventCorrectness.hpp @@ -409,14 +409,19 @@ TEST(kokkosp, parallel_scan_no_fence) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. 
+ TestScanFunctor tf; + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); auto success = validate_absence( - [=]() { - TestScanFunctor tf; - Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); - }, + [=]() { Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); }, [=](BeginFenceEvent begin_event) { if (begin_event.name.find("Debug Only Check for Execution Error") != std::string::npos || @@ -450,13 +455,20 @@ TEST(kokkosp, parallel_scan_no_fence_view) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. + TestScanFunctor tf; + Kokkos::View v("scan_result"); + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); - Kokkos::View v("scan_result"); auto success = validate_absence( [=]() { - TestScanFunctor tf; Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); }, [=](BeginFenceEvent begin_event) { diff --git a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp b/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp deleted file mode 100644 index 4e56f8996a03..000000000000 --- a/packages/kokkos/core/unit_test/tools/TestLogicalSpaces.hpp +++ /dev/null @@ -1,177 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). 
-// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -#include -#include -#include "Kokkos_Core.hpp" - -#include - -namespace Test { - -void debug_print(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Alloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} -void debug_dealloc(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Dealloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} - -void fail_on_event(const Kokkos::Profiling::SpaceHandle, const char*, - const void*, const uint64_t) { - ASSERT_TRUE(false) << "Unexpected memory event"; -} - -void expect_no_events() { - Kokkos::Tools::Experimental::set_allocate_data_callback(&fail_on_event); - Kokkos::Tools::Experimental::set_deallocate_data_callback(&fail_on_event); -} - -std::string expected_view_name; -std::string expected_space_name; -std::string error_message; -void expect_allocation_event(const std::string evn, const std::string esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_allocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} -void expect_deallocation_event(const std::string& evn, const std::string& esn, - const std::string em) { - expected_view_name = evn; - 
expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_deallocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} - -struct TestSpaceNamer { - static constexpr const char* get_name() { return "TestSpace"; } -}; -struct TestSpaceNamerTwo { - static constexpr const char* get_name() { return "YoDawg"; } -}; -struct TestSpaceNamerThree { - static constexpr const char* get_name() { return "CustomAccessSpace"; } -}; -using fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, TestSpaceNamer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - -void test_view_construct() { - { - expect_allocation_event("puppy_view", "TestSpace", "View allocation"); - Kokkos::View pup_view("puppy_view", 1000); - expect_deallocation_event("puppy_view", "TestSpace", "View free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_malloc_free() { - expect_allocation_event("does_malloc_work", "TestSpace", - "Error in malloc event"); - auto* temp = - Kokkos::kokkos_malloc("does_malloc_work", 1000); - expect_deallocation_event("does_malloc_work", "TestSpace", "Error in free"); - Kokkos::kokkos_free(temp); - Kokkos::Tools::Experimental::pause_tools(); -} -void test_chained_spaces() { - using doubly_fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - fake_memory_space, Kokkos::DefaultHostExecutionSpace, TestSpaceNamerTwo, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - { - expect_allocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space view allocation"); - Kokkos::View pup_view("xzibit_dot_jpeg", - 1000); - expect_deallocation_event("xzibit_dot_jpeg", "YoDawg", 
- "Chained space free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_space_allocations() { - fake_memory_space debug_space; - expect_allocation_event("allocation_from_space", "TestSpace", - "Space allocation"); - auto* temp = debug_space.allocate("allocation_from_space", 1000); - expect_deallocation_event("allocation_from_space", "TestSpace", - "Space deallocation"); - debug_space.deallocate("allocation_from_space", temp, 1000); - Kokkos::Tools::Experimental::pause_tools(); -} -template -struct AccessCheckKernel { - Kokkos::View data; - KOKKOS_FUNCTION void operator()(const int i) const { data[i] = i; } -}; - -template -void test_allowed_access() { - constexpr const int data_size = 1000; - // We use an unmananged View here since we want to detect a memory access - // violation in the parallel_for and not in the initialization of the View. - std::vector test_data(data_size); - Kokkos::View test_view(test_data.data(), data_size); - AccessCheckKernel functor{test_view}; - Kokkos::parallel_for( - "access_allowed", - Kokkos::RangePolicy(0, data_size), - functor); - Kokkos::fence(); -} - -using semantically_independent_logical_space = - Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, - TestSpaceNamerThree, - Kokkos::Experimental::LogicalSpaceSharesAccess::no_shared_access>; - -TEST(defaultdevicetype, logical_space_views) { test_view_construct(); } -TEST(defaultdevicetype, logical_space_malloc) { test_malloc_free(); } -TEST(defaultdevicetype, logical_space_alloc) { test_space_allocations(); } -TEST(defaultdevicetype, chained_logical_spaces) { test_chained_spaces(); } -TEST(defaultdevicetype, access_allowed) { - test_allowed_access(); -} -// FIXME_SYCL -#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) -TEST(defaultdevicetype_DeathTest, access_forbidden) { - ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { test_allowed_access(); }, - "Kokkos::View ERROR: 
attempt to access inaccessible memory space"); -} -#endif - -} // namespace Test diff --git a/packages/kokkos/core/unit_test/tools/TestProfilingSection.cpp b/packages/kokkos/core/unit_test/tools/TestProfilingSection.cpp index 318766ac455f..9d35d67feb0f 100644 --- a/packages/kokkos/core/unit_test/tools/TestProfilingSection.cpp +++ b/packages/kokkos/core/unit_test/tools/TestProfilingSection.cpp @@ -108,8 +108,8 @@ TEST(defaultdevicetype, profiling_section) { } using Kokkos::Profiling::ProfilingSection; -static_assert(!std::is_default_constructible::value, ""); -static_assert(!std::is_copy_constructible::value, ""); -static_assert(!std::is_move_constructible::value, ""); -static_assert(!std::is_copy_assignable::value, ""); -static_assert(!std::is_move_assignable::value, ""); +static_assert(!std::is_default_constructible::value); +static_assert(!std::is_copy_constructible::value); +static_assert(!std::is_move_constructible::value); +static_assert(!std::is_copy_assignable::value); +static_assert(!std::is_move_assignable::value); diff --git a/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp index 5b8a21af833b..22b8b6d63c88 100644 --- a/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp +++ b/packages/kokkos/example/tutorial/01_hello_world/hello_world.cpp @@ -58,12 +58,7 @@ struct hello_world { // is unnecessary but harmless. 
KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + Kokkos::printf("Hello from i = %i\n", i); } }; diff --git a/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp index c78f30763613..909765e1fc31 100644 --- a/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp +++ b/packages/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp @@ -76,13 +76,9 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( 15, KOKKOS_LAMBDA(const int i) { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - // printf works in a CUDA parallel kernel; std::ostream does not. - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + // Kokko::printf works for all backends in a parallel kernel; + // std::ostream does not. + Kokkos::printf("Hello from i = %i\n", i); }); #endif // You must call finalize() after you are done using Kokkos. diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp index b041f8d435b9..ee3f4721d917 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp @@ -47,13 +47,9 @@ struct hello_world { // The TeamPolicy<>::member_type provides functions to query the multi // dimensional index of a thread as well as the number of thread-teams and // the size of each team. 
-#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); } }; diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp index 933b254f7c7b..1e6812adeadb 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp @@ -57,16 +57,12 @@ int main(int narg, char* args[]) { policy, KOKKOS_LAMBDA(const team_member& thread, int& lsum) { lsum += 1; - // TeamPolicy<>::member_type provides functions to query the - // multidimensional index of a thread, as well as the number of - // thread teams and the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs workaround for printf - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + // TeamPolicy<>::member_type provides functions to query the + // multidimensional index of a thread, as well as the number of + // thread teams and the size of each team. 
+ Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); }, sum); #endif diff --git a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp index 398810d13319..75d6089e9af4 100644 --- a/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp +++ b/packages/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp @@ -43,16 +43,11 @@ struct hello_world { // the operator using a team_policy acts like a parallel region for the // team. That means that everything outside of the nested parallel_for is // also executed by all threads of the team. - Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31), - [&](const int& i) { -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: (%i , %i) executed loop %i \n", - thread.league_rank(), thread.team_rank(), i); -#else - (void) i; -#endif - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, 31), [&](const int& i) { + Kokkos::printf("Hello World: (%i , %i) executed loop %i \n", + thread.league_rank(), thread.team_rank(), i); + }); } }; diff --git a/packages/kokkos/generate_makefile.bash b/packages/kokkos/generate_makefile.bash index 301a1fceb5a1..25370daa3f2c 100755 --- a/packages/kokkos/generate_makefile.bash +++ b/packages/kokkos/generate_makefile.bash @@ -170,12 +170,9 @@ display_help_text() { echo " ARMV8_THUNDERX = ARMv8 Cavium ThunderX CPU" echo " ARMV8_THUNDERX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " 
HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -226,7 +223,6 @@ display_help_text() { echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -342,10 +338,6 @@ do KOKKOS_HWLOC=ON HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - KOKKOS_MEMKIND=ON - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -452,15 +444,6 @@ else KOKKOS_HWLOC_CMD= fi -if [ "$KOKKOS_MEMKIND" == "ON" ]; then - KOKKOS_MEMKIND_CMD=-DKokkos_ENABLE_MEMKIND=ON - if [ "$MEMKIND_PATH" != "" ]; then - KOKKOS_MEMKIND_PATH_CMD=-DMEMKIND_ROOT=$MEMKIND_PATH - fi -else - KOKKOS_MEMKIND_CMD= -fi - if [ ! -e ${KOKKOS_PATH}/CMakeLists.txt ]; then if [ "${KOKKOS_PATH}" == "" ]; then CM_SCRIPT=$0 @@ -506,5 +489,5 @@ if [[ ${COMPILER} == *clang* ]]; then fi fi -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} 
-DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} diff --git a/packages/kokkos/gnu_generate_makefile.bash b/packages/kokkos/gnu_generate_makefile.bash index 5ea159cdd47f..7a197bb71d46 100755 --- 
a/packages/kokkos/gnu_generate_makefile.bash +++ b/packages/kokkos/gnu_generate_makefile.bash @@ -74,9 +74,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -148,12 +145,9 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -198,7 +192,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -298,11 +291,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt index a43b5276a831..bd122a456bdc 100644 --- a/packages/kokkos/master_history.txt +++ b/packages/kokkos/master_history.txt @@ -35,3 +35,4 @@ tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88 tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a +tag: 4.3.00 date: 04:03:2024 
master: e0dc0128 release: f08217a4 diff --git a/packages/kokkos/scripts/docker/Dockerfile.clang b/packages/kokkos/scripts/docker/Dockerfile.clang index 5c6abc1c6de5..b493c3bbff08 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.clang +++ b/packages/kokkos/scripts/docker/Dockerfile.clang @@ -1,49 +1,13 @@ -FROM ubuntu:18.04 +FROM ubuntu:20.04 RUN apt-get update && apt-get install -y \ bc \ git \ build-essential \ + clang-format-8 \ wget \ - ccache \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ - KEYDUMP_FILE=keydump && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ - gpg --import ${KEYDUMP_FILE} && \ - gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \ - rm ${KEYDUMP_FILE}* - -ARG CMAKE_VERSION=3.16.8 -ENV CMAKE_DIR=/opt/cmake -RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ - CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ - CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ - gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ - grep -i ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sed -e s/linux/Linux/ | sha256sum --check && \ - mkdir -p ${CMAKE_DIR} && \ - sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ - rm cmake* -ENV PATH=${CMAKE_DIR}/bin:$PATH - -ENV LLVM_DIR=/opt/llvm -RUN LLVM_VERSION=8.0.0 && \ - LLVM_URL=https://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04.tar.xz && \ - LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \ - SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ - wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \ - wget --quiet ${LLVM_URL}.sig --output-document=${LLVM_ARCHIVE}.sig && \ - gpg --verify ${LLVM_ARCHIVE}.sig 
${LLVM_ARCHIVE} && \ - mkdir -p ${LLVM_DIR} && \ - tar -xvf ${LLVM_ARCHIVE} -C ${LLVM_DIR} --strip-components=1 && \ - echo "${LLVM_DIR}/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig && \ - rm -rf /root/.gnupg && \ - rm -rf ${SCRATCH_DIR} -ENV PATH=${LLVM_DIR}/bin:$PATH +ENV CLANG_FORMAT_EXE=clang-format-8 diff --git a/packages/kokkos/scripts/docker/Dockerfile.openmptarget b/packages/kokkos/scripts/docker/Dockerfile.openmptarget index 708cf533b8a6..22edcda2a073 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.openmptarget +++ b/packages/kokkos/scripts/docker/Dockerfile.openmptarget @@ -38,7 +38,7 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO rm ${CMAKE_SCRIPT} ENV PATH=${CMAKE_DIR}/bin:$PATH -ARG LLVM_VERSION=llvmorg-17.0.1 +ARG LLVM_VERSION=llvmorg-17.0.3 ENV LLVM_DIR=/opt/llvm RUN LLVM_URL=https://github.com/llvm/llvm-project/archive &&\ LLVM_ARCHIVE=${LLVM_VERSION}.tar.gz &&\ diff --git a/packages/kokkos/scripts/docker/Dockerfile.sycl b/packages/kokkos/scripts/docker/Dockerfile.sycl index 714461bfe6a5..87864da1bf76 100644 --- a/packages/kokkos/scripts/docker/Dockerfile.sycl +++ b/packages/kokkos/scripts/docker/Dockerfile.sycl @@ -55,3 +55,12 @@ RUN wget https://registrationcenter-download.intel.com/akdlm/irc_nas/19133/l_one chmod +x ./l_oneDPL_p_2022.0.0.25335.sh && \ ./l_oneDPL_p_2022.0.0.25335.sh -a -s --eula accept && \ rm l_oneDPL_p_2022.0.0.25335.sh + +# clang++ +ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin-llvm/:$PATH +# sycl-ls, icpx +ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin/:$PATH +# libsycl +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/lib:$LD_LIBRARY_PATH +# libsvml +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin:$LD_LIBRARY_PATH diff --git a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash index ae1db3186f72..830d7b12d904 100755 --- 
a/packages/kokkos/scripts/testing_scripts/generate_makefile.bash +++ b/packages/kokkos/scripts/testing_scripts/generate_makefile.bash @@ -59,9 +59,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -136,12 +133,9 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -177,7 +171,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -269,11 +262,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp index 521160b76fc4..6d0956f38321 100644 --- a/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp @@ -30,9 +30,11 @@ "Kokkos_SIMD_AVX2.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" #endif -// FIXME_HIP ROCm 5.6 and 5.7 can't compile with the intrinsic used here. 
-#if defined(__HIPCC__) && (HIP_VERSION_MAJOR == 5) && \ - ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7)) +// FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used here. +#if defined(__HIPCC__) && \ + (((HIP_VERSION_MAJOR == 5) && \ + ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7))) || \ + ((HIP_VERSION_MAJOR == 6) && ((HIP_VERSION_MINOR == 0)))) #define KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE #endif @@ -563,10 +565,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256d() const { return m_value; @@ -818,10 +828,18 @@ class simd> { element_aligned_tag) { m_value = _mm_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128() const { return m_value; @@ -1059,17 +1077,31 @@ class simd> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. 
#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); #else m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm_load_si128(reinterpret_cast<__m128i const*>(ptr)); +#else + m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i() const { return m_value; @@ -1111,6 +1143,11 @@ class simd> { return simd( _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_mullo_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { @@ -1249,6 +1286,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( @@ -1256,6 +1302,11 @@ class simd> { _mm256_maskstore_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; @@ -1278,6 +1329,13 @@ class simd> { _mm256_add_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit signed integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + // AVX2 only has eq and gt comparisons for int64 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { @@ -1306,17 +1364,19 @@ class simd> { return !(lhs == rhs); } + // fallback simd shift right arithmetic using generator constructor // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2 - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, int rhs) noexcept { - // return simd(_mm256_srai_epi64(static_cast<__m256i>(lhs), rhs)); - // } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs; }); + } - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, simd const& rhs) noexcept { - // return simd(_mm256_srav_epi64(static_cast<__m256i>(lhs), - // static_cast<__m256i>(rhs)))); - // } + // fallback simd shift right arithmetic using generator constructor + // Shift right 
arithmetic for 64bit packed ints is not availalbe in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, int rhs) noexcept { @@ -1444,6 +1504,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() @@ -1460,6 +1529,14 @@ class simd> { return simd( _mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit unsigned integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { return _mm256_srli_epi64(static_cast<__m256i>(lhs), rhs); @@ -1588,6 +1665,11 @@ class const_where_expression>, static_cast<__m256d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)), + static_cast<__m256d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1624,6 +1706,11 @@ class where_expression>, 
mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_maskload_pd( + mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1667,6 +1754,11 @@ class const_where_expression>, static_cast<__m128>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm_maskstore_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)), + static_cast<__m128>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1703,6 +1795,11 @@ class where_expression>, _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type( + _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, simd> const& index) { @@ -1746,6 +1843,12 @@ class const_where_expression< _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), static_cast<__m128i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), + static_cast<__m128i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1786,6 +1889,16 @@ class where_expression>, m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m128i tmp = _mm_load_si128(reinterpret_cast<__m128i const*>(mem)); + m_value = value_type(_mm_and_si128(tmp, 
static_cast<__m128i>(m_mask))); +#else + m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1833,6 +1946,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::int64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1874,6 +1994,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::int64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1922,6 +2053,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::uint64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1963,6 +2101,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::uint64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + 
m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp index c5d1717ad4ea..7fa35c204ae1 100644 --- a/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp @@ -193,10 +193,18 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d() const { return m_value; @@ -475,10 +483,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() const { return m_value; @@ -735,15 +751,25 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + 
_mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { @@ -934,21 +960,30 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, 
static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), @@ -1130,10 +1165,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1331,10 +1375,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1505,6 +1558,11 @@ class const_where_expression>, static_cast<__m512d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm512_mask_store_pd(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1541,6 +1599,11 @@ class where_expression>, 
_mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_pd( + _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1584,6 +1647,11 @@ class const_where_expression>, static_cast<__m256>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm256_mask_store_ps(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1619,6 +1687,10 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_ps( _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_ps( + _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1666,6 +1738,12 @@ class const_where_expression< _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1702,6 +1780,11 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1710,6 +1793,7 @@ class 
where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint32_t* mem, @@ -1784,6 +1874,12 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint32_t const* mem, @@ -1792,6 +1888,7 @@ class where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1866,6 +1969,12 @@ class where_expression>, m_value = value_type(_mm512_mask_loadu_epi64( _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1874,6 +1983,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), 
mem, 8)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1949,6 +2065,11 @@ class where_expression>, _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, simd> const& index) { @@ -1956,6 +2077,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template class simd_mask; -struct element_aligned_tag {}; +class simd_alignment_vector_aligned {}; + +template +struct simd_flags {}; + +inline constexpr simd_flags<> simd_flag_default{}; +inline constexpr simd_flags simd_flag_aligned{}; + +using element_aligned_tag = simd_flags<>; +using vector_aligned_tag = simd_flags; // class template declarations for const_where_expression and where_expression @@ -117,48 +126,6 @@ template return const_where_expression(mask, value); } -// fallback simd multiplication using generator constructor -// At the time of this writing, this fallback is only used -// to multiply vectors of 64-bit signed integers for the AVX2 backend - -template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator*( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); -} - -// fallback simd shift using generator constructor -// At the time of this edit, only the fallback for shift vectors of -// 64-bit signed integers for the AVX2 backend is used - -template >> -[[nodiscard]] 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs[i]; }); -} - // The code below provides: // operator@(simd, Arithmetic) // operator@(Arithmetic, simd) diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp index 43ece2038903..efc81135d165 100644 --- a/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD_NEON.hpp @@ -363,10 +363,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_f64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_f64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_f64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_f64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float64x2_t() const { return m_value; @@ -607,10 +615,18 @@ class simd> { element_aligned_tag) { m_value = vld1_f32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_f32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_f32(ptr, m_value); } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_f32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float32x2_t() const { return m_value; @@ -844,10 +860,18 @@ class simd> { element_aligned_tag) { m_value = vld1_s32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_s32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_s32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_s32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x2_t() const { return m_value; @@ -868,7 +892,11 @@ class simd> { return simd( vadd_s32(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vmul_s32(static_cast(lhs), static_cast(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1044,10 +1072,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_s64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_s64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_s64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_s64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int64x2_t() const { return m_value; @@ -1068,7 +1104,10 @@ class simd> { return simd( vaddq_s64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd 
operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1246,6 +1285,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_u64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_u64(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_u64(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_u64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint64x2_t() const { return m_value; @@ -1261,7 +1312,10 @@ class simd> { return simd( vaddq_u64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&( simd const& lhs, simd const& rhs) noexcept { return simd( @@ -1386,6 +1440,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1421,6 +1480,11 @@ class where_expression>, if (m_mask[1]) m_value[1] = mem[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& 
index) { @@ -1464,6 +1528,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1498,6 +1567,10 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + void copy_from(float const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1542,6 +1615,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1577,6 +1656,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1584,6 +1669,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t< @@ -1622,6 +1708,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1657,6 +1749,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1664,6 +1762,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, @@ -1744,6 +1855,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template { element_aligned_tag) { m_value = *ptr; } + KOKKOS_FORCEINLINE_FUNCTION void copy_from(T const* ptr, vector_aligned_tag) { + m_value = *ptr; + } KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, element_aligned_tag) const { *ptr = m_value; } + KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, vector_aligned_tag) const { + *ptr = m_value; + } + KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) { return m_value; } @@ -308,6 +315,10 @@ class const_where_expression, void copy_to(T* mem, element_aligned_tag) const { if (static_cast(m_mask)) *mem = static_cast(m_value); } + KOKKOS_FORCEINLINE_FUNCTION + void copy_to(T* mem, vector_aligned_tag) const { + if (static_cast(m_mask)) *mem = static_cast(m_value); + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> scatter_to(T* mem, simd const& index) const { @@ -315,13 +326,13 @@ class 
const_where_expression, mem[static_cast(index)] = static_cast(m_value); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& - impl_get_value() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION value_type const& impl_get_value() + const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& - impl_get_mask() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION mask_type const& impl_get_mask() + const { return m_mask; } }; @@ -344,6 +355,10 @@ class where_expression, void copy_from(T const* mem, element_aligned_tag) { if (static_cast(this->m_mask)) this->m_value = *mem; } + KOKKOS_FORCEINLINE_FUNCTION + void copy_from(T const* mem, vector_aligned_tag) { + if (static_cast(this->m_mask)) this->m_value = *mem; + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> gather_from(T const* mem, simd const& index) { diff --git a/packages/kokkos/simd/unit_tests/TestSIMD.cpp b/packages/kokkos/simd/unit_tests/TestSIMD.cpp index 61c076e82466..7a1f9be2a0f9 100644 --- a/packages/kokkos/simd/unit_tests/TestSIMD.cpp +++ b/packages/kokkos/simd/unit_tests/TestSIMD.cpp @@ -21,3 +21,4 @@ #include #include #include +#include diff --git a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp index 6529f20e66ac..c587ccf30468 100644 --- a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -209,4 +209,165 @@ class shift_left { } }; +class cbrt_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::cbrt(a); +#else + return Kokkos::cbrt(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::cbrt(a); + } +}; + +class exp_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::exp(a); +#else + 
return Kokkos::exp(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::exp(a); + } +}; + +class log_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::log(a); +#else + return Kokkos::log(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::log(a); + } +}; + +class hmin { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } +}; + +class hmax { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + 
KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } +}; + +class reduce { + public: + template + auto on_host(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } +}; + #endif diff --git a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp index ae2ab2c697c5..d36e1e5afc5e 100644 --- a/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp +++ b/packages/kokkos/simd/unit_tests/include/SIMDTesting_Utilities.hpp @@ -93,7 +93,7 @@ class load_element_aligned { bool host_load(T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - 
result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); return true; } template @@ -101,7 +101,26 @@ class load_element_aligned { T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); + return true; + } +}; + +class load_vector_aligned { + public: + template + bool host_load(T const* mem, std::size_t n, + Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); + return true; + } + template + KOKKOS_INLINE_FUNCTION bool device_load( + T const* mem, std::size_t n, + Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); return true; } }; @@ -116,8 +135,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = 0; return true; } @@ -130,8 +148,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = T(0); return true; } diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 4af08c266bb8..23e3826c752a 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -37,10 +37,10 @@ inline void host_check_gen_ctor() 
{ } simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); #if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if constexpr (std::is_same_v) { @@ -98,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); device_check_equality(basic, rhs, lanes); simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); @@ -106,7 +106,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); device_check_equality(result, blend, lanes); } diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp index 802e41efe5f2..59f2f6c18fdf 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -61,13 +61,18 @@ void host_check_math_op_one_loader(UnaryOp unary_op, std::size_t n, simd_type arg; bool const loaded_arg = loader.host_load(args + i, nlanes, arg); if (!loaded_arg) continue; - auto computed_result = unary_op.on_host(arg); - decltype(computed_result) expected_result; + decltype(unary_op.on_host(arg)) expected_result; for (std::size_t lane = 0; lane < simd_type::size(); ++lane) { - if (lane < nlanes) + if (lane < nlanes) { + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) + 
arg[lane] = Kokkos::abs(arg[lane]); expected_result[lane] = unary_op.on_host_serial(T(arg[lane])); + } } + auto computed_result = unary_op.on_host(arg); host_check_equality(expected_result, computed_result, nlanes); } } @@ -78,6 +83,7 @@ inline void host_check_math_op_all_loaders(Op op, std::size_t n, host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); + host_check_math_op_one_loader(op, n, args...); } template @@ -96,6 +102,13 @@ inline void host_check_all_math_ops(const DataType (&first_args)[n], // TODO: Place fallback implementations for all simd integer types if constexpr (std::is_floating_point_v) { host_check_math_op_all_loaders(divides(), n, first_args, second_args); + +#if defined(__INTEL_COMPILER) && \ + (defined(KOKKOS_ARCH_AVX2) || defined(KOKKOS_ARCH_AVX512XEON)) + host_check_math_op_all_loaders(cbrt_op(), n, first_args); + host_check_math_op_all_loaders(exp_op(), n, first_args); + host_check_math_op_all_loaders(log_op(), n, first_args); +#endif } } @@ -109,23 +122,29 @@ inline void host_check_abi_size() { template inline void host_check_math_ops() { constexpr size_t n = 11; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); host_check_abi_size(); if constexpr (!std::is_integral_v) { - DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, - -2.0, 10.0, 0.0, 1.2, -2.8}; - DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, - -2.4, 1.0, 13.0, -3.2, -2.1}; + alignas(alignment) DataType const first_args[n] = { + 0.1, 0.4, 0.5, 0.7, 1.0, 1.5, -2.0, 10.0, 0.0, 1.2, -2.8}; + alignas(alignment) DataType const second_args[n] = { + 1.0, 0.2, 1.1, 1.8, -0.1, -3.0, -2.4, 1.0, 13.0, -3.2, -2.1}; host_check_all_math_ops(first_args, second_args); } else { if constexpr (std::is_signed_v) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 
13, -3, -2}; + alignas(alignment) + DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + alignas(alignment) DataType const second_args[n] = {1, 2, 1, 1, 1, -3, + -2, 1, 13, -3, -2}; host_check_all_math_ops(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; + alignas(alignment) + DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + alignas(alignment) + DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; host_check_all_math_ops(first_args, second_args); } } @@ -202,6 +221,7 @@ KOKKOS_INLINE_FUNCTION void device_check_math_op_all_loaders(Op op, device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); + device_check_math_op_one_loader(op, n, args...); } template @@ -282,8 +302,13 @@ TEST(simd, host_math_ops) { } TEST(simd, device_math_ops) { - Kokkos::parallel_for(Kokkos::RangePolicy>(0, 1), - simd_device_math_ops_functor()); +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_math_ops_functor()); } #endif diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp new file mode 100644 index 000000000000..b3c7ac9a01e8 --- /dev/null +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_Reductions.hpp @@ -0,0 +1,184 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TEST_SIMD_REDUCTIONS_HPP +#define KOKKOS_TEST_SIMD_REDUCTIONS_HPP + +#include +#include + +template +inline void host_check_reduction_one_loader(ReductionOp reduce_op, + std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.host_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < n; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_host_serial(value); + auto computed = reduce_op.on_host(value); + + gtest_checker().equality(expected, computed); + } +} + +template +inline void host_check_reduction_all_loaders(ReductionOp reduce_op, + std::size_t n, T const* args) { + host_check_reduction_one_loader(reduce_op, n, + args); + host_check_reduction_one_loader(reduce_op, n, args); + host_check_reduction_one_loader(reduce_op, n, args); +} + +template +inline void host_check_all_reductions(const DataType (&args)[n]) { + host_check_reduction_all_loaders(hmin(), n, args); + host_check_reduction_all_loaders(hmax(), n, args); + host_check_reduction_all_loaders(reduce(), n, args); +} + +template +inline void host_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + 
host_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + host_check_all_reductions(args); + } +} + +template +inline void host_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + (host_check_reductions(), ...); +} + +template +inline void host_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (host_check_reductions_all_types(DataTypes()), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_one_loader( + ReductionOp reduce_op, std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.device_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < n; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_device_serial(value); + auto computed = reduce_op.on_device(value); + + kokkos_checker().equality(expected, computed); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_all_loaders( + ReductionOp reduce_op, std::size_t n, T const* args) { + device_check_reduction_one_loader(reduce_op, n, + args); + device_check_reduction_one_loader(reduce_op, n, args); + device_check_reduction_one_loader(reduce_op, n, args); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_all_reductions( + const DataType (&args)[n]) { + device_check_reduction_all_loaders(hmin(), n, args); + device_check_reduction_all_loaders(hmax(), n, args); + device_check_reduction_all_loaders(reduce(), n, args); +} + +template +KOKKOS_INLINE_FUNCTION void 
device_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + device_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + device_check_all_reductions(args); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + (device_check_reductions(), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (device_check_reductions_all_types(DataTypes()), ...); +} + +class simd_device_reduction_functor { + public: + KOKKOS_INLINE_FUNCTION void operator()(int) const { + device_check_reductions_all_abis( + Kokkos::Experimental::Impl::device_abi_set()); + } +}; + +TEST(simd, host_reductions) { + host_check_reductions_all_abis(Kokkos::Experimental::Impl::host_abi_set()); +} + +TEST(simd, device_reductions) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_reduction_functor()); +} + +#endif diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp index f6fdcb920ed2..ffdd2cba4a0e 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_ShiftOps.hpp @@ -85,10 +85,11 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by, n); host_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + host_check_shift_on_one_loader(shift_op, test_vals, + shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - 
shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -96,6 +97,8 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by_lanes); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + host_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template @@ -104,12 +107,14 @@ inline void host_check_shift_ops() { using simd_type = Kokkos::Experimental::simd; constexpr std::size_t width = simd_type::size(); constexpr std::size_t num_cases = 8; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); DataType max = std::numeric_limits::max(); - DataType shift_by[num_cases] = { + alignas(alignment) DataType shift_by[num_cases] = { 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; - DataType test_vals[width]; + alignas(alignment) DataType test_vals[width]; for (std::size_t i = 0; i < width; ++i) { DataType inc = max / width; test_vals[i] = i * inc + 1; @@ -201,10 +206,11 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_by, n); device_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + device_check_shift_on_one_loader( + shift_op, test_vals, shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -212,6 +218,8 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_op, test_vals, shift_by_lanes); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + device_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template diff --git 
a/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp index 129f2b0d5c90..152fd9e9840d 100644 --- a/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp @@ -29,7 +29,7 @@ inline void host_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -46,7 +46,7 @@ inline void host_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); host_check_equality(expected_result, dst_simd, nlanes); } @@ -107,7 +107,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -124,7 +124,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); device_check_equality(expected_result, dst_simd, nlanes); } diff --git a/packages/kokkos/tpls/desul/Config.hpp.cmake.in b/packages/kokkos/tpls/desul/Config.hpp.cmake.in index a7bc738191e7..aed7ecfabc96 100644 --- a/packages/kokkos/tpls/desul/Config.hpp.cmake.in +++ 
b/packages/kokkos/tpls/desul/Config.hpp.cmake.in @@ -14,6 +14,8 @@ SPDX-License-Identifier: (BSD-3-Clause) #cmakedefine DESUL_ATOMICS_ENABLE_HIP #cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_SYCL +#cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP +#cmakedefine DESUL_ATOMICS_ENABLE_OPENACC #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp index 082fc132de53..15c6d78d94bf 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp @@ -88,15 +88,18 @@ using sycl_atomic_ref = sycl::atomic_ref; #endif -// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead #ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED -// FIXME_SYCL The compiler forces us to use device_image_scope. Drop this when possible. +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL +template +using sycl_device_global = sycl::ext::oneapi::experimental::device_global; +#else template using sycl_device_global = sycl::ext::oneapi::experimental::device_global< T, decltype(sycl::ext::oneapi::experimental::properties( sycl::ext::oneapi::experimental::device_image_scope))>; #endif +#endif } // namespace Impl } // namespace desul diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp index e91569e1dee8..72639fc49322 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git 
a/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp new file mode 100644 index 000000000000..77149bd47419 --- /dev/null +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp @@ -0,0 +1,153 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ +#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ + +#include + +#include +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope /*scope*/) { + if constexpr (std::is_arithmetic_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // Floating point types treated separetely to work around compiler errors + // "parse invalid cast opcode for cast from 'i32' to 'float'". 
+ // Also not just "forwarding" arguments to atomicCAS because it does not have an + // overload that takes int64_t + if constexpr (std::is_integral_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + static_assert(sizeof(unsigned int) == 4); + static_assert(sizeof(unsigned long long int) == 8); + using cas_t = + std::conditional_t<(sizeof(T) == 4), unsigned int, unsigned long long int>; + cas_t return_val = atomicCAS(reinterpret_cast(dest), + reinterpret_cast(compare), + reinterpret_cast(value)); + return reinterpret_cast(return_val); +#ifdef DESUL_CUDA_ARCH_IS_PRE_PASCAL + } else if constexpr (std::is_same_v) { +#else + } else if constexpr (std::is_same_v || std::is_same_v) { +#endif + return atomicCAS(dest, compare, value); + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; + } +} + +#else // not NVHPC + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope) { + if constexpr (std::is_arithmetic_v) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // 
device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic operation " + "in the OpenACC backend\n"); + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; +} + +#endif + +} // namespace Impl +} // namespace desul + +#endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp index adf75c574371..1b161397c74b 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op.hpp @@ -23,6 +23,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp new file mode 100644 index 000000000000..ab570ac5787a --- /dev/null +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp @@ -0,0 +1,431 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ +#ifndef DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ + +#include // min, max +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +template +inline constexpr bool is_openacc_integral_type_v = + std::is_same_v || std::is_same_v || + std::is_same_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_same_v || +#ifndef DESUL_CUDA_ARCH_IS_PRE_PASCAL + std::is_same_v || +#endif + is_openacc_integral_type_v; + +#else + +template +inline constexpr bool is_openacc_integral_type_v = std::is_integral_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_arithmetic_v; + +#endif + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_add( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_inc( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_sub( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_dec( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_mul( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr *= val; + } + return old; +} + +#pragma acc routine seq +template 
+std::enable_if_t, T> device_atomic_fetch_div( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr /= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_lshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr << val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_rshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr >> val; + } + return old; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_max( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; + old = atomicMax(ptr, val); + return old; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_min( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + int old; + old = atomicMin(ptr, val); + return old; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_and( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr &= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_or( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr |= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_xor( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr ^= val; + } + return old; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_add_fetch( + 
T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_inc_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_sub_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_dec_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_mul_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr *= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_div_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr /= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_lshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr << val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_rshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr >> val; + tmp = *ptr; + } + return tmp; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_max_fetch( + T* ptr, const T val, MemoryOrderRelaxed, 
MemoryScopeDevice) { + T tmp; + tmp = atomicMax(ptr, val); + tmp = std::max(tmp, val); + return tmp; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_min_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMin(ptr, val); + tmp = std::min(tmp, val); + return tmp; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_and_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr &= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_or_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr |= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_xor_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr ^= val; + tmp = *ptr; + } + return tmp; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelease, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_store(MemoryOrderRelease): Not supported atomic " + "operation in the OpenACC backend\n"); + } +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, 
MemoryOrderAcquire, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_load(MemoryOrderAcquire): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} +// + +} // namespace Impl +} // namespace desul + +#endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp index fef10222e34e..fa71477c2996 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Generic.hpp @@ -18,11 +18,14 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_thread_fence(MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_DEVICE(return Impl::device_atomic_thread_fence(order, scope);) DESUL_IF_ON_HOST(return Impl::host_atomic_thread_fence(order, scope);) } + +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { @@ -30,6 +33,7 @@ atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_HOST(return Impl::host_atomic_exchange(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope scope) { @@ -40,6 +44,7 @@ atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope sc } // Fetch_Oper atomics: return value before operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -47,6 +52,7 @@ atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_add(dest, val, order, scope);) } 
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -54,6 +60,7 @@ atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -61,6 +68,7 @@ atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -68,6 +76,7 @@ atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -75,6 +84,7 @@ atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -82,6 +92,7 @@ atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -89,6 +100,7 @@ atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mod(dest, val, order, scope);) } 
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -96,6 +108,7 @@ atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_and(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -103,6 +116,7 @@ atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_or(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -110,6 +124,7 @@ atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_xor(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -117,6 +132,7 @@ atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_nand(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, const unsigned int val, @@ -126,6 +142,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_lshift(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, const unsigned int val, @@ -136,6 +153,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, } // Oper Fetch atomics: return value after operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_add_fetch(T* const dest, const T val, 
MemoryOrder order, MemoryScope scope) { @@ -143,6 +161,7 @@ atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_add_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -150,6 +169,7 @@ atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_sub_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -157,6 +177,7 @@ atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_max_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -164,6 +185,7 @@ atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_min_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -171,6 +193,7 @@ atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mul_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -178,6 +201,7 @@ atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_div_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mod_fetch(T* const dest, const T 
val, MemoryOrder order, MemoryScope scope) { @@ -185,6 +209,7 @@ atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mod_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -192,6 +217,7 @@ atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_and_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -199,6 +225,7 @@ atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_or_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -206,6 +233,7 @@ atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_xor_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -213,6 +241,7 @@ atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_nand_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, const unsigned int val, @@ -222,6 +251,7 @@ DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_lshift_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, const unsigned int val, @@ -233,6 +263,7 @@ 
DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, // Other atomics +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, MemoryOrder order, @@ -241,6 +272,7 @@ DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_load(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_store(T* const dest, const T val, @@ -250,6 +282,7 @@ DESUL_INLINE_FUNCTION void atomic_store(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_store(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_add(T* const dest, const T val, @@ -259,6 +292,7 @@ DESUL_INLINE_FUNCTION void atomic_add(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, const T val, @@ -268,6 +302,7 @@ DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, const T val, @@ -277,6 +312,7 @@ DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_div(T* const dest, const T val, @@ -286,6 +322,7 @@ DESUL_INLINE_FUNCTION void atomic_div(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_min(T* const dest, const T val, @@ -295,6 +332,7 @@ DESUL_INLINE_FUNCTION void atomic_min(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_max(T* const dest, const T val, 
@@ -304,6 +342,7 @@ DESUL_INLINE_FUNCTION void atomic_max(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, MemoryOrder order, @@ -312,6 +351,7 @@ DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, MemoryOrder order, @@ -320,6 +360,7 @@ DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_dec_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, MemoryOrder order, @@ -328,6 +369,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -335,6 +377,7 @@ atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, MemoryOrder order, @@ -343,6 +386,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -350,6 +394,7 @@ atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, 
MemoryOrder order, @@ -358,6 +403,7 @@ DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, MemoryOrder order, @@ -367,6 +413,7 @@ DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, } // FIXME +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template SYCL_SPACE_ATOMIC_LOCKS_DEVICE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_DEVICE; -SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_NODE; #define SYCL_SPACE_ATOMIC_MASK 0x1FFFF @@ -128,6 +149,34 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { lock_node_ref.exchange(0); } +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue q) { + static bool once = [&q]() { +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, + &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, + sizeof(int32_t*)); + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, + &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, + sizeof(int32_t*)); +#else + auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; + auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; + q.single_task([=] { + SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; + SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; + }); +#endif + return true; + }(); + (void)once; +} + #else // not supported template @@ -155,7 +204,26 @@ inline bool lock_address_sycl(void*, MemoryScopeNode) { inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); } inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); } + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION 
+inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue) { +} + #endif } // namespace Impl + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline void ensure_sycl_lock_arrays_on_device(sycl::queue) {} +#else +static inline void ensure_sycl_lock_arrays_on_device(sycl::queue q) { + Impl::copy_sycl_lock_arrays_to_device(q); +} +#endif + } // namespace desul #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp index cb97f4a906db..b6a399100b17 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp @@ -17,6 +17,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_HIP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp new file mode 100644 index 000000000000..d4dd74588bda --- /dev/null +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp @@ -0,0 +1,81 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. 
+Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ + +#include +#include +#include +#include + +namespace desul { +namespace Impl { + +template = 0> +inline T device_atomic_fetch_oper(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_fetch_oper(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = op.apply(return_val, val); + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +template = 0> +inline T device_atomic_oper_fetch(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_oper_fetch(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = op.apply(*dest, val); + *dest = return_val; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp index 3a14b93d3230..d11beb0c8050 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Macros.hpp @@ -57,6 +57,10 @@ 
SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_HAVE_OPENMP_ATOMICS #endif +#if defined(DESUL_ATOMICS_ENABLE_OPENACC) +#define DESUL_HAVE_OPENACC_ATOMICS +#endif + // ONLY use GNUC atomics if not explicitly say to use OpenMP atomics #if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) #define DESUL_HAVE_GCC_ATOMICS @@ -123,6 +127,30 @@ static constexpr bool desul_impl_omp_on_host() { return false; } #endif #endif +#if defined(DESUL_HAVE_OPENACC_ATOMICS) +#include +#ifdef __NVCOMPILER +// FIXME_OPENACC We cannot determine in a constant expresion whether we are on host or +// on device with NVHPC. We use the device implementation on both sides. +#define DESUL_IF_ON_DEVICE(CODE) \ + { DESUL_IMPL_STRIP_PARENS(CODE) } +#define DESUL_IF_ON_HOST(CODE) \ + {} +#else +#define DESUL_IF_ON_DEVICE(CODE) \ + if constexpr (acc_on_device(acc_device_not_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#define DESUL_IF_ON_HOST(CODE) \ + if constexpr (acc_on_device(acc_device_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#endif +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE _Pragma("acc routine seq") +#else +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE +#endif + #if !defined(DESUL_IF_ON_HOST) && !defined(DESUL_IF_ON_DEVICE) #if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ diff --git a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp index 24078aae07fe..6a741f6d478c 100644 --- a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git 
a/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp new file mode 100644 index 000000000000..a5c8aa1c8a72 --- /dev/null +++ b/packages/kokkos/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp @@ -0,0 +1,25 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ +#define DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ + +namespace desul { +namespace Impl { + +#pragma acc routine seq +template +void device_atomic_thread_fence(MemoryOrder, MemoryScope) { + // FIXME_OPENACC: The current OpenACC standard does not support explicit thread fence + // operations. +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp b/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp index 9e84c60e41a5..6660c76e11a3 100644 --- a/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp +++ b/packages/kokkos/tpls/desul/src/Lock_Array_SYCL.cpp @@ -14,10 +14,12 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul::Impl { +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#endif int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; @@ -31,19 +33,7 @@ void init_lock_arrays_sycl(sycl::queue q) { SYCL_SPACE_ATOMIC_LOCKS_NODE_h = sycl::malloc_host(SYCL_SPACE_ATOMIC_MASK + 1, q); - // FIXME_SYCL Once supported, the following should be replaced by - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, - // &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, - // sizeof(int32_t*)); - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, - // &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, - 
// sizeof(int32_t*)); - auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; - auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; - q.single_task([=] { - SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; - SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; - }); + copy_sycl_lock_arrays_to_device(q); q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, 0, @@ -63,7 +53,10 @@ void finalize_lock_arrays_sycl(sycl::queue q) { sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q); SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION + copy_sycl_lock_arrays_to_device(q); +#endif } -} // namespace desul::Impl +} // namespace desul::Impl #endif diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp index ab1561bd47fa..25389a2fa5e7 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp @@ -27,165 +27,165 @@ namespace detail { // For no unique address emulation, this is the case taken when neither are empty. // For real `[[no_unique_address]]`, this case is always taken. 
-template struct __compressed_pair { - _MDSPAN_NO_UNIQUE_ADDRESS _T __t_val; - _MDSPAN_NO_UNIQUE_ADDRESS _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; +template struct __compressed_pair { + _MDSPAN_NO_UNIQUE_ADDRESS _T1 __t1_val{}; + _MDSPAN_NO_UNIQUE_ADDRESS _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr 
__compressed_pair(_TLike &&__t, _ULike &&__u) - : __t_val((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : __t1_val((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; #if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) // First empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && !_MDSPAN_TRAIT(std::is_empty, _U)>> - : private _T { - _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { - return *static_cast<_T *>(this); + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && !_MDSPAN_TRAIT(std::is_empty, _T2)>> + : private _T1 { + _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { + return *static_cast<_T1 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return *static_cast<_T const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return *static_cast<_T1 const *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr 
__compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _T((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T1((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; // Second empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t> - : private _U { - _T __t_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; + _T1, _T2, + std::enable_if_t> + : private _T2 { + _T1 __t1_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { - return *static_cast<_U *>(this); + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { + return *static_cast<_T2 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return *static_cast<_U const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return *static_cast<_T2 const *>(this); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr 
__compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; + ~__compressed_pair() = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _U((_ULike &&) __u), __t_val((_TLike &&) __t) {} + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T2((_T2Like &&) __t2), __t1_val((_T1Like &&) __t1) {} }; // Both empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && _MDSPAN_TRAIT(std::is_empty, _U)>> + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && _MDSPAN_TRAIT(std::is_empty, _T2)>> // We need to use the __no_unique_address_emulation wrapper here to avoid // base class ambiguities. #ifdef _MDSPAN_COMPILER_MSVC // MSVC doesn't allow you to access public static member functions of a type // when you *happen* to privately inherit from that type. 
- : protected __no_unique_address_emulation<_T, 0>, - protected __no_unique_address_emulation<_U, 1> + : protected __no_unique_address_emulation<_T1, 0>, + protected __no_unique_address_emulation<_T2, 1> #else - : private __no_unique_address_emulation<_T, 0>, - private __no_unique_address_emulation<_U, 1> + : private __no_unique_address_emulation<_T1, 0>, + private __no_unique_address_emulation<_T2, 1> #endif { - using __first_base_t = __no_unique_address_emulation<_T, 0>; - using __second_base_t = __no_unique_address_emulation<_U, 1>; + using __first_base_t = __no_unique_address_emulation<_T1, 0>; + using __second_base_t = __no_unique_address_emulation<_T2, 1>; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return this->__second_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { return this->__second_base_t::__ref(); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED 
__compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) noexcept - : __first_base_t(_T((_TLike &&) __t)), - __second_base_t(_U((_ULike &&) __u)) + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) noexcept + : __first_base_t(_T1((_T1Like &&) __t1)), + __second_base_t(_T2((_T2Like &&) __t2)) { } }; diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp index d35e201cebd2..8e42a37ba7c7 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp @@ -35,10 +35,17 @@ #define MDSPAN_CXX_STD_14 201402L #define MDSPAN_CXX_STD_17 201703L #define MDSPAN_CXX_STD_20 202002L +// Note GCC has not updated this in version 13 +#ifdef __clang__ +#define MDSPAN_CXX_STD_23 202302L +#else +#define MDSPAN_CXX_STD_23 202100L +#endif #define MDSPAN_HAS_CXX_14 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14) #define MDSPAN_HAS_CXX_17 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_17) #define MDSPAN_HAS_CXX_20 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_20) +#define MDSPAN_HAS_CXX_23 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_23) static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or later."); @@ -224,7 +231,7 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or #endif #ifndef MDSPAN_CONDITIONAL_EXPLICIT -# if MDSPAN_HAS_CXX_20 && !defined(_MDSPAN_COMPILER_MSVC) +# if 
MDSPAN_HAS_CXX_20 # define MDSPAN_CONDITIONAL_EXPLICIT(COND) explicit(COND) # else # define MDSPAN_CONDITIONAL_EXPLICIT(COND) diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp index 0dd31c4cd0aa..9a28c3ed5ca3 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp @@ -55,6 +55,14 @@ __check_compatible_extents( return {}; } +template +MDSPAN_INLINE_FUNCTION +static constexpr bool are_valid_indices() { + return + (std::is_convertible::value && ... && true) && + (std::is_nothrow_constructible::value && ... && true); +} + // ------------------------------------------------------------------ // ------------ static_array ---------------------------------------- // ------------------------------------------------------------------ @@ -140,7 +148,8 @@ struct index_sequence_scan_impl { template struct index_sequence_scan_impl { -#if defined(__NVCC__) || defined(__NVCOMPILER) +#if defined(__NVCC__) || defined(__NVCOMPILER) || \ + defined(_MDSPAN_COMPILER_INTEL) // NVCC warns about pointless comparison with 0 for R==0 and r being const // evaluatable and also 0. MDSPAN_INLINE_FUNCTION @@ -167,7 +176,7 @@ template <> struct index_sequence_scan_impl<0> { // all static values. 
template struct possibly_empty_array { - T vals[N]; + T vals[N]{}; MDSPAN_INLINE_FUNCTION constexpr T &operator[](size_t r) { return vals[r]; } MDSPAN_INLINE_FUNCTION @@ -251,12 +260,17 @@ struct maybe_static_array { #ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, - /* requires */ (N == m_size_dynamic)) + /* requires */ (N == m_size_dynamic && N > 0)) MDSPAN_INLINE_FUNCTION constexpr maybe_static_array(const std::span &vals) { for (size_t r = 0; r < N; r++) m_dyn_vals[r] = static_cast(vals[r]); } + + MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, + /* requires */ (N == m_size_dynamic && N == 0)) + MDSPAN_INLINE_FUNCTION + constexpr maybe_static_array(const std::span &) : m_dyn_vals{} {} #endif // constructors from all values @@ -423,9 +437,9 @@ template class extents { class OtherIndexType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && + _MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, - OtherIndexType) && + const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -436,8 +450,8 @@ template class extents { MDSPAN_TEMPLATE_REQUIRES( class OtherIndexType, size_t N, /* requires */ - (_MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, OtherIndexType) && + (_MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -454,6 +468,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... 
DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) == dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -468,6 +483,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) != dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -481,6 +497,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R == m_rank) && (DynCount == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &, @@ -491,17 +508,20 @@ template class extents { public: // Converting constructor from other extents specializations - MDSPAN_TEMPLATE_REQUIRES( - class OtherIndexType, size_t... OtherExtents, - /* requires */ - ( - /* multi-stage check to protect from invalid pack expansion when sizes - don't match? */ - decltype(detail::__check_compatible_extents( - std::integral_constant{}, + MDSPAN_TEMPLATE_REQUIRES( + class OtherIndexType, size_t... OtherExtents, + /* requires */ + ( + /* multi-stage check to protect from invalid pack expansion when sizes + don't match? 
*/ + decltype(detail::__check_compatible_extents( + // using: sizeof...(Extents) == sizeof...(OtherExtents) as the second argument fails with MSVC+NVCC with some obscure expansion error + // MSVC: 19.38.33133 NVCC: 12.0 + std::integral_constant::rank() == extents::rank()>{}, std::integer_sequence{}, - std::integer_sequence{}))::value)) + std::integer_sequence{}))::value + ) + ) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT((((Extents != dynamic_extent) && (OtherExtents == dynamic_extent)) || @@ -518,10 +538,14 @@ template class extents { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const extents &lhs, const extents &rhs) noexcept { - bool value = true; - for (size_type r = 0; r < m_rank; r++) - value &= rhs.extent(r) == lhs.extent(r); - return value; + if constexpr (rank() != extents::rank()) { + return false; + } else { + using common_t = std::common_type_t; + for (size_type r = 0; r < m_rank; r++) + if(static_cast(rhs.extent(r)) != static_cast(lhs.extent(r))) return false; + } + return true; } #if !(MDSPAN_HAS_CXX_20) @@ -570,7 +594,7 @@ using dextents = typename detail::__make_dextents::type; template extents(IndexTypes...) -> extents; + ((void) sizeof(IndexTypes), ::MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent)...>; #endif // Helper type traits for identifying a class as extents. 
diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp index af44494a98d8..83ed9ef7fe36 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp @@ -18,6 +18,9 @@ #include "macros.hpp" #include "trait_backports.hpp" #include "extents.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" +#include +#include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -108,6 +111,36 @@ class layout_left::mapping { */ } +#if MDSPAN_HAS_CXX_17 + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if _Mapping is a layout_left_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. 
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping& __other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -124,13 +157,14 @@ class layout_left::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=0; r<__extents.rank(); r++) { - if(stride != static_cast(other.stride(r))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_left with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=0; r<__extents.rank(); r++) { + if(static_cast(stride) != static_cast(other.stride(r))) + std::abort(); // ("Assigning layout_stride to layout_left with invalid strides."); + stride *= __extents.extent(r); } - stride *= __extents.extent(r); } #endif } @@ -155,10 +189,7 @@ class layout_left::mapping { class... 
Indices, /* requires */ ( (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -172,9 +203,9 @@ class layout_left::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -187,7 +218,10 @@ class layout_left::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -195,7 +229,10 @@ class layout_left::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -215,6 +252,17 @@ class layout_left::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], 
submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp index a0586484202e..3d3927df7bcc 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp @@ -20,6 +20,7 @@ #include "extents.hpp" #include #include "layout_stride.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -113,6 +114,34 @@ class layout_right::mapping { */ } + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if _Mapping is a layout_right_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. 
+ */ +#if MDSPAN_HAS_CXX_17 + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_right_padded_mapping<_Mapping>::value + && std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping &__other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -129,13 +158,14 @@ class layout_right::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=__extents.rank(); r>0; r--) { - if(stride != static_cast(other.stride(r-1))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_right with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=__extents.rank(); r>0; r--) { + if(static_cast(stride) != static_cast(other.stride(r-1))) + std::abort(); // ("Assigning layout_stride to layout_right with invalid strides."); + stride *= __extents.extent(r-1); } - stride *= __extents.extent(r-1); } #endif } @@ -157,13 +187,10 @@ class layout_right::mapping { //-------------------------------------------------------------------------------- MDSPAN_TEMPLATE_REQUIRES( - class... Indices, + class ... 
Indices, /* requires */ ( - (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (sizeof...(Indices) == extents_type::rank()) && + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -174,9 +201,9 @@ class layout_right::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -189,7 +216,10 @@ class layout_right::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -197,7 +227,10 @@ class layout_right::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ (Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ 
-217,6 +250,17 @@ class layout_right::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; } // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp index 030a494529b6..15ad577d149c 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp @@ -88,7 +88,7 @@ struct layout_stride { : private detail::__no_unique_address_emulation< detail::__compressed_pair< Extents, - std::array + detail::possibly_empty_array > > #endif @@ -109,7 +109,7 @@ struct layout_stride { //---------------------------------------------------------------------------- - using __strides_storage_t = std::array; + using __strides_storage_t = detail::possibly_empty_array; using __member_pair_t = detail::__compressed_pair; #if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) @@ -158,14 +158,16 @@ struct layout_stride { template MDSPAN_INLINE_FUNCTION static constexpr bool _eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_AND((self.stride(Idxs) == other.stride(Idxs)) /* && ... */) - && _MDSPAN_FOLD_AND((self.extents().extent(Idxs) == other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_AND((static_cast(self.stride(Idxs)) == static_cast(other.stride(Idxs))) /* && ... 
*/) + && _MDSPAN_FOLD_AND((static_cast(self.extents().extent(Idxs)) == static_cast(other.extents().extent(Idxs))) /* || ... */); } template MDSPAN_INLINE_FUNCTION static constexpr bool _not_eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_OR((self.stride(Idxs) != other.stride(Idxs)) /* || ... */) - || _MDSPAN_FOLD_OR((self.extents().extent(Idxs) != other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_OR((static_cast(self.stride(Idxs)) != static_cast(other.stride(Idxs))) /* || ... */) + || _MDSPAN_FOLD_OR((static_cast(self.extents().extent(Idxs)) != static_cast(other.extents().extent(Idxs))) /* || ... */); } template @@ -205,6 +207,11 @@ struct layout_stride { } #endif + MDSPAN_INLINE_FUNCTION + static constexpr std::array return_strides(const __strides_storage_t& s) { + return std::array{s[Idxs]...}; + } + template MDSPAN_INLINE_FUNCTION static constexpr size_t __return_zero() { return 0; } @@ -218,6 +225,21 @@ struct layout_stride { // Can't use defaulted parameter in the __deduction_workaround template because of a bug in MSVC warning C4348. 
using __impl = __deduction_workaround>; + static constexpr __strides_storage_t strides_storage(std::true_type) { + __strides_storage_t s{}; + + extents_type e; + index_type stride = 1; + for(int r = static_cast(extents_type::rank() - 1); r >= 0; r--) { + s[r] = stride; + stride *= e.extent(r); + } + + return s; + } + static constexpr __strides_storage_t strides_storage(std::false_type) { + return {}; + } //---------------------------------------------------------------------------- @@ -233,7 +255,21 @@ struct layout_stride { //-------------------------------------------------------------------------------- - MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else + : __base_t(__base_t{__member_pair_t( +#endif + extents_type(), + __strides_storage_t(strides_storage(std::integral_constant 0)>{})) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + {} + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default; MDSPAN_TEMPLATE_REQUIRES( @@ -332,10 +368,10 @@ struct layout_stride { ) #endif MDSPAN_CONDITIONAL_EXPLICIT( - (!std::is_convertible::value) && - (detail::__is_mapping_of || - detail::__is_mapping_of || - detail::__is_mapping_of) + !(std::is_convertible::value && + (detail::__is_mapping_of || + detail::__is_mapping_of || + detail::__is_mapping_of)) ) // needs two () due to comma MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 mapping(StridedLayoutMapping const& other) noexcept // NOLINT(google-explicit-constructor) @@ -374,7 +410,7 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION constexpr std::array< index_type, extents_type::rank() > strides() const noexcept { - return __strides_storage(); + return __impl::return_strides(__strides_storage()); } MDSPAN_INLINE_FUNCTION @@ -393,8 +429,7 @@ struct layout_stride { class... 
Indices, /* requires */ ( sizeof...(Indices) == Extents::rank() && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -410,17 +445,37 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { - return required_span_size() == __get_size(extents(), std::make_index_sequence()); + if constexpr (extents_type::rank() == 0) + return true; + else { + index_type span_size = required_span_size(); + if (span_size == static_cast(0)) { + if constexpr (extents_type::rank() == 1) { + return stride(0) == 1; + } else { + rank_type r_largest = 0; + for (rank_type r = 1; r < extents_type::rank(); r++) { + if (stride(r) > stride(r_largest)) { + r_largest = r; + } + } + for (rank_type r = 0; r < extents_type::rank(); r++) { + if (extents().extent(r) == 0 && r != r_largest) { + return false; + } + } + return true; + } + } else { + return required_span_size() == __get_size(extents(), std::make_index_sequence()); + } + } } MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION - constexpr index_type stride(rank_type r) const noexcept -#if MDSPAN_HAS_CXX_20 - requires ( Extents::rank() > 0 ) -#endif - { + constexpr index_type stride(rank_type r) const noexcept { return __strides_storage()[r]; } @@ -444,10 +499,13 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const mapping& x, const StridedLayoutMapping& y) noexcept { bool strides_match = true; - for(rank_type r = 0; r < extents_type::rank(); r++) - strides_match = strides_match && (x.stride(r) == y.stride(r)); + if constexpr (extents_type::rank() > 0) { + using common_t = std::common_type_t; + for(rank_type r = 0; r < 
extents_type::rank(); r++) + strides_match = strides_match && (static_cast(x.stride(r)) == static_cast(y.stride(r))); + } return (x.extents() == y.extents()) && - (__impl::__OFFSET(y)== static_cast(0)) && + (__impl::__OFFSET(y) == static_cast(0)) && strides_match; } @@ -489,6 +547,17 @@ struct layout_stride { } #endif + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; }; diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp index 6febe3002150..d6ec49e65bf8 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp @@ -55,6 +55,13 @@ class mdspan ReferenceType __callop(mdspan const& __self, const std::array& indices) noexcept { return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); } +#ifdef __cpp_lib_span + template + MDSPAN_FORCE_INLINE_FUNCTION static constexpr + ReferenceType __callop(mdspan const& __self, const std::span& indices) noexcept { + return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); + } +#endif }; public: @@ -109,9 +116,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... 
*/) && ((sizeof...(SizeTypes) == rank()) || (sizeof...(SizeTypes) == rank_dynamic())) && + (detail::are_valid_indices()) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) ) @@ -125,8 +131,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -142,8 +148,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -160,7 +166,7 @@ class mdspan (MDSPAN_INLINE_FUNCTION constexpr), mdspan, (data_handle_type p, const extents_type& exts), , /* requires */ (_MDSPAN_TRAIT(std::is_default_constructible, accessor_type) && - _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type)) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const extents_type&)) ) : __members(std::move(p), __map_acc_pair_t(mapping_type(exts), accessor_type())) { } @@ -179,10 +185,14 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherAccessor, /* requires */ ( - 
_MDSPAN_TRAIT(std::is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping) && - _MDSPAN_TRAIT(std::is_constructible, accessor_type, OtherAccessor) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const typename OtherLayoutPolicy::template mapping&) && + _MDSPAN_TRAIT(std::is_constructible, accessor_type, const OtherAccessor&) ) ) + MDSPAN_CONDITIONAL_EXPLICIT( + !_MDSPAN_TRAIT(std::is_convertible, const typename OtherLayoutPolicy::template mapping&, mapping_type) || + !_MDSPAN_TRAIT(std::is_convertible, const OtherAccessor&, accessor_type) + ) MDSPAN_INLINE_FUNCTION constexpr mdspan(const mdspan& other) : __members(other.__ptr_ref(), __map_acc_pair_t(other.__mapping_ref(), other.__accessor_ref())) @@ -226,8 +236,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -240,8 +250,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -271,9 +281,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... 
*/) && - extents_type::rank() == sizeof...(SizeTypes) + extents_type::rank() == sizeof...(SizeTypes) && + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -285,8 +294,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -299,8 +308,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -311,7 +320,7 @@ class mdspan #endif // __cpp_lib_span #endif // MDSPAN_USE_PAREN_OPERATOR - MDSPAN_INLINE_FUNCTION constexpr size_t size() const noexcept { + MDSPAN_INLINE_FUNCTION constexpr size_type size() const noexcept { return __impl::__size(*this); }; @@ -346,13 +355,13 @@ class mdspan //-------------------------------------------------------------------------------- // [mdspan.basic.obs], mdspan observers of the mapping - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return mapping_type::is_always_unique(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return mapping_type::is_always_exhaustive(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return mapping_type::is_always_strided(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() { return mapping_type::is_always_unique(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() { return 
mapping_type::is_always_exhaustive(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() { return mapping_type::is_always_strided(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return __mapping_ref().is_unique(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return __mapping_ref().is_exhaustive(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return __mapping_ref().is_strided(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const { return __mapping_ref().is_unique(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const { return __mapping_ref().is_exhaustive(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const { return __mapping_ref().is_strided(); }; MDSPAN_INLINE_FUNCTION constexpr index_type stride(size_t r) const { return __mapping_ref().stride(r); }; private: @@ -374,7 +383,7 @@ class mdspan #if defined(_MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) MDSPAN_TEMPLATE_REQUIRES( class ElementType, class... SizeTypes, - /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_integral, SizeTypes) /* && ... */) && + /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, size_t) /* && ... */) && (sizeof...(SizeTypes) > 0) ) MDSPAN_DEDUCTION_GUIDE explicit mdspan(ElementType*, SizeTypes...) diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp index 3950273a83dc..bdc5925f7151 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp @@ -103,8 +103,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) 
&& + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) && (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t) || container_is_array::value) && @@ -133,61 +133,29 @@ class mdarray { ) : map_(m), ctr_(container_is_array::construct(map_)) { } - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(const container_type& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(ctr) - { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (const container_type& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, const container_type& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(const container_type& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, const container_type& ctr) : map_(m), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) 
&& - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(container_type&& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(std::move(ctr)) - { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (container_type&& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, container_type&& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(container_type&& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, container_type&& ctr) : map_(m), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherContainer, /* requires */ ( @@ -229,7 +197,7 @@ class mdarray { _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, const container_type& ctr, const Alloc& a) : map_(exts), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -238,7 +206,7 @@ class mdarray { /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, const container_type& ctr, const Alloc& a) : map_(map), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -248,7 +216,7 @@ class mdarray { _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) 
MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, container_type&& ctr, const Alloc& a) : map_(exts), ctr_(std::move(ctr), a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -257,7 +225,7 @@ class mdarray { /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, container_type&& ctr, const Alloc& a) : map_(map), ctr_(std::move(ctr), a) { assert(ctr_.size() >= map_.required_span_size()); } @@ -344,8 +312,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -356,8 +324,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... 
*/) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -433,8 +401,9 @@ class mdarray { class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + mdspan_type) ) ) constexpr operator mdspan () { @@ -445,8 +414,9 @@ class mdarray { class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, const_mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + const_mdspan_type) ) ) constexpr operator mdspan () const { diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp index 58f38620ba1a..89ba8202fb16 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp @@ -20,7 +20,6 @@ #include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace { template @@ -29,6 +28,7 @@ namespace { template struct __mdspan_is_integral_constant>: std::true_type {}; } + // Slice Specifier allowing for strides and compile time extent template struct strided_slice { @@ -36,14 +36,13 @@ struct strided_slice { using extent_type = ExtentType; using stride_type = StrideType; - OffsetType offset; - ExtentType extent; - StrideType stride; + _MDSPAN_NO_UNIQUE_ADDRESS OffsetType offset{}; + _MDSPAN_NO_UNIQUE_ADDRESS ExtentType extent{}; + _MDSPAN_NO_UNIQUE_ADDRESS StrideType stride{}; static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); static_assert(std::is_integral_v || 
__mdspan_is_integral_constant::value); static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); }; -} // MDSPAN_IMPL_PROPOSED_NAMESPACE } // MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp index b9672b7f9ac3..abddd0b59df1 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp @@ -20,23 +20,21 @@ #include "submdspan_mapping.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { template MDSPAN_INLINE_FUNCTION constexpr auto submdspan(const mdspan &src, SliceSpecifiers... slices) { - const auto sub_mapping_offset = submdspan_mapping(src.mapping(), slices...); + const auto sub_submdspan_mapping_result = submdspan_mapping(src.mapping(), slices...); // NVCC has a problem with the deduction so lets figure out the type - using sub_mapping_t = std::remove_cv_t; + using sub_mapping_t = std::remove_cv_t; using sub_extents_t = typename sub_mapping_t::extents_type; using sub_layout_t = typename sub_mapping_t::layout_type; using sub_accessor_t = typename AccessorPolicy::offset_policy; return mdspan( - src.accessor().offset(src.data_handle(), sub_mapping_offset.offset), - sub_mapping_offset.mapping, + src.accessor().offset(src.data_handle(), sub_submdspan_mapping_result.offset), + sub_submdspan_mapping_result.mapping, sub_accessor_t(src.accessor())); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp index f56ce023f165..c3b2f78fb998 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp +++ 
b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp @@ -20,7 +20,6 @@ #include "strided_slice.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace detail { // Mapping from submapping ranks to srcmapping ranks @@ -319,5 +318,4 @@ constexpr auto submdspan_extents(const extents &src_exts, return detail::extents_constructor::next_extent( src_exts, slices...); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp index 48778d57e75f..ca6948c9a9f7 100644 --- a/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp @@ -22,21 +22,15 @@ #include // index_sequence namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { //****************************************** // Return type of submdspan_mapping overloads //****************************************** -template struct mapping_offset { - Mapping mapping; +template struct submdspan_mapping_result { + _MDSPAN_NO_UNIQUE_ADDRESS LayoutMapping mapping{}; size_t offset; }; -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE namespace detail { -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::first_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::stride_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::inv_map_rank; - // constructs sub strides template MDSPAN_INLINE_FUNCTION @@ -98,17 +92,15 @@ struct preserve_layout_left_mapping, SubRank, #pragma diag_suppress = implicit_return_from_non_void_function #endif // Actual submdspan mapping call -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_left::mapping &src_mapping, - SliceSpecifiers... 
slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; +layout_left::mapping::submdspan_mapping_impl(SliceSpecifiers... slices) const { // compute sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // figure out sub layout type @@ -121,18 +113,18 @@ submdspan_mapping(const layout_left::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_left case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -140,7 +132,7 @@ submdspan_mapping(const layout_left::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -207,17 +199,15 @@ struct preserve_layout_right_mapping, SubRank, #pragma diagnostic push #pragma diag_suppress = implicit_return_from_non_void_function #endif -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_right::mapping 
&src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - +layout_right::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { // get sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // determine new layout type @@ -230,18 +220,18 @@ submdspan_mapping(const layout_right::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_right case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -249,7 +239,7 @@ submdspan_mapping(const layout_right::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -270,23 +260,22 @@ submdspan_mapping(const layout_right::mapping &src_mapping, //********************************** // layout_stride submdspan_mapping //********************************* -template +template +template MDSPAN_INLINE_FUNCTION 
constexpr auto -submdspan_mapping(const layout_stride::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); +layout_stride::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); using dst_mapping_t = typename layout_stride::template mapping; - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -294,6 +283,7 @@ submdspan_mapping(const layout_stride::mapping &src_mapping, #else std::tuple(detail::stride_of(slices)...))), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } + } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp new file mode 100644 index 000000000000..a80148679238 --- /dev/null +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp @@ -0,0 +1,793 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include +#include "layout_padded_fwd.hpp" +#include "../__p0009_bits/dynamic_extent.hpp" +#include "../__p0009_bits/extents.hpp" +#include "../__p0009_bits/mdspan.hpp" +#include "../__p0009_bits/layout_left.hpp" +#include "../__p0009_bits/layout_right.hpp" +#include "../__p0009_bits/layout_stride.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +namespace detail { +template +MDSPAN_INLINE_FUNCTION +constexpr _T +find_next_multiple(_T alignment, _T offset) +{ + if ( alignment == 0 ) { + return _T(0); + } else { + return ( ( offset + alignment - 1 ) / alignment) * alignment; + } +} + +template +MDSPAN_INLINE_FUNCTION constexpr size_t get_actual_static_padding_value() { + constexpr auto rank = _ExtentsType::rank(); + + if constexpr (rank <= typename _ExtentsType::rank_type(1)) { + return 0; + } else if constexpr (_PaddingValue != dynamic_extent && + _ExtentsType::static_extent(_ExtentToPadIdx) != + dynamic_extent) { + static_assert( + (_PaddingValue != 0) || + (_ExtentsType::static_extent(_ExtentToPadIdx) == 0), + "padding stride can be 0 only if " + "extents_type::static_extent(extent-to-pad) is 0 or dynamic_extent"); + return find_next_multiple(_PaddingValue, + _ExtentsType::static_extent(_ExtentToPadIdx)); + } else { + return dynamic_extent; + } +} + +template +struct static_array_type_for_padded_extent +{ + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, + detail::get_actual_static_padding_value()>; +}; + +template 
+struct static_array_type_for_padded_extent<_PaddingValue, _Extents, + _ExtentToPadIdx, Rank, std::enable_if_t> { + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = + ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, 0>; +}; + +template +struct padded_extent { + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using static_array_type = typename static_array_type_for_padded_extent< + padding_value, _Extents, _ExtentToPadIdx, _Extents::rank()>::type; + + static constexpr auto static_value() { return static_array_type::static_value(0); } + + MDSPAN_INLINE_FUNCTION + static constexpr static_array_type + init_padding(const _Extents &exts) { + if constexpr ((_Extents::rank() > 1) && (padding_value == dynamic_extent)) { + return {exts.extent(_ExtentToPadIdx)}; + } else { + return init_padding(exts, padding_value); + } + } + + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Extents &exts, + [[maybe_unused]] index_type pv) { + if constexpr (_Extents::rank() > 1) { + return {find_next_multiple(pv, + exts.extent(_ExtentToPadIdx))}; + } else { + return {}; + } + } + + template + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Mapping &other_mapping, + std::integral_constant) { + if constexpr (_Extents::rank() > 1) { + return {other_mapping.stride(_PaddingStrideIdx)}; + } else { + return {}; + } + } +}; +} // namespace detail + +template +template +class layout_left_padded::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_left_padded; + +#ifndef 
MDSPAN_INTERNAL_TEST +private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = detail::layout_padded_constants::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "out of bounds access for rank 0"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + constexpr index_type compute_offset(std::index_sequence<>) const { + return 0; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffset index_offset) const { + return index_offset; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { + index_type indices[] = {static_cast(index_offsets)...}; + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/96e1914aa2e6d8966acbfbe2f4d184201f1aa318/libcxx/include/mdspan/layout_left.h#L144 + index_type res = 0; + ((res = indices[extents_type::rank() - 1 - Ranks] + + ((extents_type::rank() - 1 - Ranks) == extent_to_pad_idx + ? 
padded_stride.value(0) + : exts.extent(extents_type::rank() - 1 - Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type& ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) + {} + + /** + * Initializes the mapping with the given extents and the specified padding value. + * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, dynamic_padding_value)), exts(ext) + { + assert((padding_value == dynamic_extent) || (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_left::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. 
+ * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (static_padding_stride != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (static_padding_stride == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + } + + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. 
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) + constexpr + mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && extents_type::rank() <= 1 + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + constexpr + mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) + {} + + constexpr const extents_type &extents() const noexcept + { + return exts; + } + + constexpr std::array + strides() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return {}; + } else if constexpr ( extents_type::rank() == 1 ) { + return {1}; + } else { + index_type value = 1; + std::array s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; ++r) + { + s[r] = value; + value *= exts.extent(r); + } + s[extents_type::rank() - 1] = value; + return s; + } + } + + constexpr index_type + required_span_size() const noexcept + { + if constexpr ( 
extents_type::rank() == 0 ) { + return 1; + } else if constexpr ( extents_type::rank() == 1 ) { + return exts.extent(0); + } else { + index_type value = padded_stride.value(0); + for (rank_type r = 1; r < extents_type::rank(); ++r) { + value *= exts.extent(r); + } + return value; + } + } + + /** + * Return the mapping given the provided indices per rank. + * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v && ...) is true`, and + * - (is_nothrow_constructible_v && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ ( + sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) + ) + ) + constexpr size_t operator()(_Indices... idxs) const noexcept + { + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + static constexpr bool is_always_unique() noexcept { return true; } + static constexpr bool is_always_exhaustive() noexcept + { + return (extents_type::rank() <= rank_type(1)) + || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent + && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + } + static constexpr bool is_always_strided() noexcept { return true; } + + static constexpr bool is_unique() noexcept { return true; } + constexpr bool is_exhaustive() const noexcept + { + return (extents_type::rank() < 2) + || (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + static constexpr bool is_strided() noexcept { return true; } + + constexpr index_type stride(rank_type r) const noexcept + { + assert(r < extents_type::rank()); + if(r == 0) return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = 1; k < r; k++) value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_left_padded`s + * + * This overload only participates in overload 
resolution if `OtherExtents::rank() == extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept + { + // Workaround for some compilers not short-circuiting properly with compile-time checks + // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) + { + strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_left_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. 
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept + { + return !(left == right); + } +#endif +}; + +template +template +class layout_right_padded::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_right_padded; + +#ifndef MDSPAN_INTERNAL_TEST + private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = detail::layout_padded_constants::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "if padding stride is 0, static_extent(extent-to-pad-rank) must also be 0 or dynamic_extent"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + constexpr index_type compute_offset(std::index_sequence<>) const { + return 0; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffset index_offset) const { + return index_offset; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffsets... 
index_offsets) const { + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/4d9771741d40cc9cfcccb6b033f43689d36b705a/libcxx/include/mdspan/layout_right.h#L141 + index_type res = 0; + ((res = static_cast(index_offsets) + + (Ranks == extent_to_pad_idx ? padded_stride.value(0) + : exts.extent(Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) {} + + /** + * Initializes the mapping with the given extents and the specified padding value. 
+ * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, static_cast(dynamic_padding_value))), + exts(ext) { + assert((padding_value == dynamic_extent) || + (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_right::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (padded_stride_type::static_value() != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (padded_stride_type::static_value() == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. 
+ * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + {} + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && + (padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent))) + constexpr mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && extents_type::rank() <= 1 + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + constexpr mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) + {} + + constexpr const extents_type &extents() const noexcept + { + return exts; + } + + constexpr std::array + strides() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return {}; + } else if constexpr ( extents_type::rank() == 1 ) { + return {1}; + } else { + index_type value = 1; + std::array s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx - 1; r > 0; --r) + { + s[r] = value; + value *= exts.extent(r); + } + s[0] = value; + return s; + } + } + + constexpr index_type + required_span_size() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return 1; + } else if constexpr ( extents_type::rank() == 1 ) { + return exts.extent(0); + } else { + index_type value = 1; + for (rank_type r = 0; r < extent_to_pad_idx; ++r) + { + value *= exts.extent(r); + } + return value * padded_stride.value(0); + } + } + + /** + * Return the mapping given the provided indices per rank. + * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v && ...) is true`, and + * - (is_nothrow_constructible_v && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ ( + sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) + ) + ) + constexpr size_t operator()(_Indices... 
idxs) const noexcept + { + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + static constexpr bool is_always_unique() noexcept { return true; } + static constexpr bool is_always_exhaustive() noexcept + { + return (extents_type::rank() <= rank_type(1)) + || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent + && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + } + static constexpr bool is_always_strided() noexcept { return true; } + + static constexpr bool is_unique() noexcept { return true; } + constexpr bool is_exhaustive() const noexcept + { + return (extents_type::rank() < 2) + || (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + static constexpr bool is_strided() noexcept { return true; } + + constexpr index_type stride(rank_type r) const noexcept + { + assert(r < extents_type::rank()); + if(r == extents_type::rank() - 1) return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = extents_type::rank() - 2; k > r; k--) value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_right_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept + { + // Workaround for some compilers not short-circuiting properly with compile-time checks + // i.e. 
we can't access stride(_padding_stride_idx) of a rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) + { + strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_right_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept + { + return !(left == right); + } +#endif +}; +} +} diff --git a/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp b/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp new file mode 100644 index 000000000000..945f091a2dc9 --- /dev/null +++ b/packages/kokkos/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
+// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include +#include "../__p0009_bits/dynamic_extent.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +template +struct layout_left_padded { + template + class mapping; +}; + +template +struct layout_right_padded { + template + class mapping; +}; + +namespace detail { +// The layout_padded_constants structs are only useful if rank > 1, otherwise they may wrap +template +struct layout_padded_constants; + +template +struct layout_padded_constants, _ExtentsType> +{ + using rank_type = typename _ExtentsType::rank_type; + static constexpr rank_type padded_stride_idx = 1; + static constexpr rank_type extent_to_pad_idx = 0; +}; + +template +struct layout_padded_constants, _ExtentsType> +{ + using rank_type = typename _ExtentsType::rank_type; + static constexpr rank_type padded_stride_idx = _ExtentsType::rank() - 2; + static constexpr rank_type extent_to_pad_idx = _ExtentsType::rank() - 1; +}; + +template +struct is_layout_left_padded : std::false_type {}; + +template +struct is_layout_left_padded> : std::true_type {}; + +template +struct is_layout_left_padded_mapping : std::false_type {}; + +template +struct is_layout_left_padded_mapping<_Mapping, + std::enable_if_t::template mapping>::value>> + : std::true_type {}; + +template +struct is_layout_right_padded : std::false_type {}; + +template +struct is_layout_right_padded> : std::true_type {}; + +template +struct is_layout_right_padded_mapping : std::false_type {}; + +template +struct is_layout_right_padded_mapping<_Mapping, + std::enable_if_t::template mapping>::value>> + : std::true_type {}; + +template +constexpr void check_padded_layout_converting_constructor_mandates() +{ + if constexpr (_LayoutExtentsType::rank() > 1) { + using extents_type = typename _PaddedLayoutMappingType::extents_type; + constexpr auto padding_value = _PaddedLayoutMappingType::padding_value; + constexpr 
auto idx = layout_padded_constants::extent_to_pad_idx; + if constexpr ((_LayoutExtentsType::static_extent(idx) != dynamic_extent) && + (extents_type::static_extent(idx) != dynamic_extent) && + (padding_value != dynamic_extent)) { + if constexpr (padding_value == 0) { + static_assert(_LayoutExtentsType::static_extent(idx) == 0); + } else { + static_assert( + _LayoutExtentsType::static_extent(idx) % padding_value == 0); + } + } + } +} + +template +constexpr void check_padded_layout_converting_constructor_preconditions([[maybe_unused]] const _OtherMapping &other_mapping) { + if constexpr (_ExtentsType::rank() > 1) { + constexpr auto padded_stride_idx = + layout_padded_constants::padded_stride_idx; + constexpr auto extent_to_pad_idx = layout_padded_constants::extent_to_pad_idx; + assert(other_mapping.stride(padded_stride_idx) == other_mapping.extents().extent(extent_to_pad_idx)); + } +} +} +} +} diff --git a/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp b/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp index b440873526ab..ac72a1a4e64f 100644 --- a/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp +++ b/packages/kokkos/tpls/mdspan/include/mdspan/mdspan.hpp @@ -35,6 +35,7 @@ #include "../experimental/__p0009_bits/layout_right.hpp" #include "../experimental/__p0009_bits/macros.hpp" #if MDSPAN_HAS_CXX_17 +#include "../experimental/__p2642_bits/layout_padded.hpp" #include "../experimental/__p2630_bits/submdspan.hpp" #endif diff --git a/packages/muelu/CMakeLists.txt b/packages/muelu/CMakeLists.txt index 28e913853113..d418d42750a6 100644 --- a/packages/muelu/CMakeLists.txt +++ b/packages/muelu/CMakeLists.txt @@ -239,7 +239,7 @@ IF(${PACKAGE_NAME}_ENABLE_EXPLICIT_INSTANTIATION) # IF(Tpetra_INST_DOUBLE AND Tpetra_INST_INT_LONG) - GLOBAL_SET(${PACKAGE_NAME}_HAVE_GO_INT ON) + GLOBAL_SET(${PACKAGE_NAME}_HAVE_GO_LONG ON) GLOBAL_SET(${PACKAGE_NAME}_INST_DOUBLE_INT_LONGINT ON) GLOBAL_SET(HAVE_${PACKAGE_NAME_UC}_INST_DOUBLE_INT_LONGINT ON) ELSE() @@ -280,7 +280,7 @@ 
IF(${PACKAGE_NAME}_ENABLE_EXPLICIT_INSTANTIATION) # IF(Tpetra_INST_FLOAT AND Tpetra_INST_INT_INT) - GLOBAL_SET(${PACKAGE_NAME}_HAVE_GO_LONG ON) + GLOBAL_SET(${PACKAGE_NAME}_HAVE_GO_INT ON) GLOBAL_SET(${PACKAGE_NAME}_INST_FLOAT_INT_INT ON) GLOBAL_SET(HAVE_${PACKAGE_NAME_UC}_INST_FLOAT_INT_INT ON) ELSE() diff --git a/packages/muelu/src/Graph/HybridAggregation/MueLu_HybridAggregationFactory_def.hpp b/packages/muelu/src/Graph/HybridAggregation/MueLu_HybridAggregationFactory_def.hpp index c72f7203b879..ecbff335f524 100644 --- a/packages/muelu/src/Graph/HybridAggregation/MueLu_HybridAggregationFactory_def.hpp +++ b/packages/muelu/src/Graph/HybridAggregation/MueLu_HybridAggregationFactory_def.hpp @@ -89,14 +89,13 @@ RCP HybridAggregationFactory validParamList = rcp(new ParameterList()); - typedef Teuchos::StringToIntegralParameterEntryValidator validatorType; #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) // From UncoupledAggregationFactory SET_VALID_ENTRY("aggregation: max agg size"); SET_VALID_ENTRY("aggregation: min agg size"); SET_VALID_ENTRY("aggregation: max selected neighbors"); SET_VALID_ENTRY("aggregation: ordering"); - validParamList->getEntry("aggregation: ordering").setValidator(rcp(new validatorType(Teuchos::tuple("natural", "graph", "random"), "aggregation: ordering"))); + validParamList->getEntry("aggregation: ordering").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple("natural", "graph", "random")))); SET_VALID_ENTRY("aggregation: enable phase 1"); SET_VALID_ENTRY("aggregation: enable phase 2a"); SET_VALID_ENTRY("aggregation: enable phase 2b"); diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp index 9eb9c002875c..2c421c477bde 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp +++ 
b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_def.hpp @@ -131,9 +131,8 @@ RCP CoalesceDropFactory validatorType; // "signed classical" is the Ruge-Stuben style (relative to max off-diagonal), "sign classical sa" is the signed version of the sa criterion (relative to the diagonal values) - validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new validatorType(Teuchos::tuple("signed classical sa", "classical", "distance laplacian", "signed classical", "block diagonal", "block diagonal classical", "block diagonal distance laplacian", "block diagonal signed classical", "block diagonal colored signed classical"), "aggregation: drop scheme"))); + validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple("signed classical sa", "classical", "distance laplacian", "signed classical", "block diagonal", "block diagonal classical", "block diagonal distance laplacian", "block diagonal signed classical", "block diagonal colored signed classical")))); } SET_VALID_ENTRY("aggregation: distance laplacian algo"); SET_VALID_ENTRY("aggregation: classical algo"); diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp index c9473d641cb2..575884676a70 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_CoalesceDropFactory_kokkos_def.hpp @@ -463,8 +463,7 @@ RCP CoalesceDropFactory_kokkos validatorType; - validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new validatorType(Teuchos::tuple("classical", "distance laplacian"), "aggregation: drop scheme"))); + validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple("classical", "distance laplacian")))); } #undef SET_VALID_ENTRY 
validParamList->set>("A", Teuchos::null, "Generating factory of the matrix A"); diff --git a/packages/muelu/src/Graph/MatrixTransformation/MueLu_SmooVecCoalesceDropFactory_def.hpp b/packages/muelu/src/Graph/MatrixTransformation/MueLu_SmooVecCoalesceDropFactory_def.hpp index 076f5df9361c..b5d8fe1c813c 100644 --- a/packages/muelu/src/Graph/MatrixTransformation/MueLu_SmooVecCoalesceDropFactory_def.hpp +++ b/packages/muelu/src/Graph/MatrixTransformation/MueLu_SmooVecCoalesceDropFactory_def.hpp @@ -98,8 +98,7 @@ RCP SmooVecCoalesceDropFactorysetEntry(name, MasterList::getEntry(name)) SET_VALID_ENTRY("aggregation: drop scheme"); { - typedef Teuchos::StringToIntegralParameterEntryValidator validatorType; - validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new validatorType(Teuchos::tuple("unsupported vector smoothing"), "aggregation: drop scheme"))); + validParamList->getEntry("aggregation: drop scheme").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple("classical", "distance laplacian", "unsupported vector smoothing")))); } SET_VALID_ENTRY("aggregation: number of random vectors"); SET_VALID_ENTRY("aggregation: number of times to pre or post smooth"); diff --git a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp index 28e25372ea3b..1b525cd0158a 100644 --- a/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp +++ b/packages/muelu/src/Graph/UncoupledAggregation/MueLu_UncoupledAggregationFactory_def.hpp @@ -87,13 +87,12 @@ RCP UncoupledAggregationFactory validatorType; #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name)) SET_VALID_ENTRY("aggregation: max agg size"); SET_VALID_ENTRY("aggregation: min agg size"); SET_VALID_ENTRY("aggregation: max selected neighbors"); SET_VALID_ENTRY("aggregation: ordering"); - validParamList->getEntry("aggregation: 
ordering").setValidator(rcp(new validatorType(Teuchos::tuple("natural", "graph", "random"), "aggregation: ordering"))); + validParamList->getEntry("aggregation: ordering").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple("natural", "graph", "random")))); SET_VALID_ENTRY("aggregation: deterministic"); SET_VALID_ENTRY("aggregation: coloring algorithm"); SET_VALID_ENTRY("aggregation: enable phase 1"); diff --git a/packages/muelu/src/Interface/MueLu_HierarchyManager.hpp b/packages/muelu/src/Interface/MueLu_HierarchyManager.hpp index e38e8cc1c221..4c60d1438525 100644 --- a/packages/muelu/src/Interface/MueLu_HierarchyManager.hpp +++ b/packages/muelu/src/Interface/MueLu_HierarchyManager.hpp @@ -283,6 +283,7 @@ class HierarchyManager : public HierarchyFactory::Build(Level& fineLev // Reuse pattern if available (multiple solve) RCP APparams = rcp(new ParameterList); if (pL.isSublist("matrixmatrix: kernel params")) - APparams->sublist("matrixmatrix: kernel params") = pL.sublist("matrixmatrix: kernel params"); + APparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); // By default, we don't need global constants for A*P APparams->set("compute global constants: temporaries", APparams->get("compute global constants: temporaries", false)); @@ -189,7 +189,7 @@ void RAPFactory::Build(Level& fineLev // Reuse coarse matrix memory if available (multiple solve) RCP RAPparams = rcp(new ParameterList); if (pL.isSublist("matrixmatrix: kernel params")) - RAPparams->sublist("matrixmatrix: kernel params") = pL.sublist("matrixmatrix: kernel params"); + RAPparams = rcp(new ParameterList(pL.sublist("matrixmatrix: kernel params"))); if (coarseLevel.IsAvailable("RAP reuse data", this)) { GetOStream(static_cast(Runtime0 | Test)) << "Reusing previous RAP data" << std::endl; diff --git a/packages/muelu/src/MueCentral/MueLu_Hierarchy_decl.hpp b/packages/muelu/src/MueCentral/MueLu_Hierarchy_decl.hpp index 22b0f8f41843..170098ae5cdc 100644 --- 
a/packages/muelu/src/MueCentral/MueLu_Hierarchy_decl.hpp +++ b/packages/muelu/src/MueCentral/MueLu_Hierarchy_decl.hpp @@ -247,6 +247,8 @@ class Hierarchy : public BaseClass { void SetupRe(); + void CheckForEmptySmoothersAndCoarseSolve(); + //! Clear impermanent data from previous setup void Clear(int startLevel = 0); void ExpertClear(); diff --git a/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp b/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp index fab4ce803855..66ae54356f16 100644 --- a/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp +++ b/packages/muelu/src/MueCentral/MueLu_Hierarchy_def.hpp @@ -601,6 +601,8 @@ void Hierarchy::SetupRe() { ResetDescription(); describe(GetOStream(Statistics0), GetVerbLevel()); + + CheckForEmptySmoothersAndCoarseSolve(); } template @@ -673,6 +675,17 @@ void Hierarchy::Setup(const FactoryMa manager.Clean(); describe(GetOStream(Statistics0), GetVerbLevel()); + + CheckForEmptySmoothersAndCoarseSolve(); +} + +template +void Hierarchy::CheckForEmptySmoothersAndCoarseSolve() { + for (LO levelNo = 0; levelNo < as(Levels_.size()); ++levelNo) { + auto level = Levels_[levelNo]; + if ((!level->IsAvailable("PreSmoother")) && (!level->IsAvailable("PostSmoother"))) + GetOStream(Warnings1) << "No " << (levelNo == as(Levels_.size()) - 1 ? 
"coarse grid solver" : "smoother") << " on level " << level->GetLevelID() << std::endl; + } } template @@ -834,7 +847,6 @@ ConvergenceStatus Hierarchy::Iterate( emptyFineSolve = false; } if (emptyFineSolve == true) { - GetOStream(Warnings1) << "No fine grid smoother" << std::endl; // Fine grid smoother is identity fineX->update(one, B, zero); } @@ -854,7 +866,6 @@ ConvergenceStatus Hierarchy::Iterate( emptyCoarseSolve = false; } if (emptyCoarseSolve == true) { - GetOStream(Warnings1) << "No coarse grid solver" << std::endl; // Coarse operator is identity coarseX->update(one, *coarseRhs, zero); } @@ -1003,7 +1014,6 @@ ConvergenceStatus Hierarchy::Iterate( zeroGuess = false; } if (emptySolve == true) { - GetOStream(Warnings1) << "No coarse grid solver" << std::endl; // Coarse operator is identity X.update(one, B, zero); } diff --git a/packages/muelu/src/Rebalancing/MueLu_RebalanceTransferFactory_def.hpp b/packages/muelu/src/Rebalancing/MueLu_RebalanceTransferFactory_def.hpp index d51e75de0789..27e1285f7367 100644 --- a/packages/muelu/src/Rebalancing/MueLu_RebalanceTransferFactory_def.hpp +++ b/packages/muelu/src/Rebalancing/MueLu_RebalanceTransferFactory_def.hpp @@ -82,8 +82,8 @@ RCP RebalanceTransferFactory validatorType; - RCP typeValidator = rcp(new validatorType(Teuchos::tuple("Interpolation", "Restriction"), "type")); + typedef Teuchos::StringValidator validatorType; + RCP typeValidator = rcp(new validatorType(Teuchos::tuple("Interpolation", "Restriction"))); validParamList->set("type", "Interpolation", "Type of the transfer operator that need to be rebalanced (Interpolation or Restriction)", typeValidator); } diff --git a/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp b/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp index c368c6ead9d7..ba72e83f2e0c 100644 --- a/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp +++ b/packages/muelu/src/Smoothers/MueLu_Amesos2Smoother_def.hpp @@ -333,7 +333,7 @@ void Amesos2Smoother::Setup(Level& 
cu amesos2_params->setName("Amesos2"); if ((rowMap->getGlobalNumElements() != as((rowMap->getMaxAllGlobalIndex() - rowMap->getMinAllGlobalIndex()) + 1)) || (!rowMap->isContiguous() && (rowMap->getComm()->getSize() == 1))) { - if (!(amesos2_params->sublist(prec_->name()).template isType("IsContiguous"))) + if ((type_ != "Cusolver") && !(amesos2_params->sublist(prec_->name()).template isType("IsContiguous"))) amesos2_params->sublist(prec_->name()).set("IsContiguous", false, "Are GIDs Contiguous"); } prec_->setParameters(amesos2_params); diff --git a/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp b/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp index 6d4e606676ac..a16f34d7b38c 100644 --- a/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp +++ b/packages/muelu/src/Smoothers/MueLu_Ifpack2Smoother_def.hpp @@ -586,6 +586,14 @@ void Ifpack2Smoother::SetupLineSmooth myparamList.set("partitioner: map", TVertLineIdSmoo); myparamList.set("partitioner: local parts", maxPart + 1); } else { + if (myparamList.isParameter("partitioner: block size") && + myparamList.get("partitioner: block size") != -1) { + int block_size = myparamList.get("partitioner: block size"); + TEUCHOS_TEST_FOR_EXCEPTION(numLocalRows % block_size != 0, Exceptions::RuntimeError, + "MueLu::Ifpack2Smoother::Setup(): the number of local nodes is incompatible with the specified block size."); + numLocalRows /= block_size; + } + // we assume a constant number of DOFs per node size_t numDofsPerNode = numLocalRows / TVertLineIdSmoo.size(); diff --git a/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp index 7e0bfe840603..452bd061c4e6 100644 --- a/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp +++ b/packages/muelu/src/Transfers/Classical/MueLu_ClassicalPFactory_def.hpp @@ -100,8 +100,7 @@ RCP ClassicalPFactory validatorType; - validParamList->getEntry("aggregation: 
classical scheme").setValidator(rcp(new validatorType(Teuchos::tuple("direct", "ext+i", "classical modified"), "aggregation: classical scheme"))); + validParamList->getEntry("aggregation: classical scheme").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple("direct", "ext+i", "classical modified")))); } #undef SET_VALID_ENTRY diff --git a/packages/muelu/src/Transfers/Energy-Minimization/MueLu_EminPFactory_def.hpp b/packages/muelu/src/Transfers/Energy-Minimization/MueLu_EminPFactory_def.hpp index aad7bfb621f7..3e35e667b5d0 100644 --- a/packages/muelu/src/Transfers/Energy-Minimization/MueLu_EminPFactory_def.hpp +++ b/packages/muelu/src/Transfers/Energy-Minimization/MueLu_EminPFactory_def.hpp @@ -26,8 +26,7 @@ RCP EminPFactory SET_VALID_ENTRY("emin: num reuse iterations"); SET_VALID_ENTRY("emin: iterative method"); { - typedef Teuchos::StringToIntegralParameterEntryValidator validatorType; - validParamList->getEntry("emin: iterative method").setValidator(rcp(new validatorType(Teuchos::tuple("cg", "sd", "gmres"), "emin: iterative method"))); + validParamList->getEntry("emin: iterative method").setValidator(rcp(new Teuchos::StringValidator(Teuchos::tuple("cg", "sd", "gmres")))); } #undef SET_VALID_ENTRY diff --git a/packages/muelu/test/interface/default/Output/onelevel_tpetra.gold b/packages/muelu/test/interface/default/Output/onelevel_tpetra.gold index 54222c686b67..f2622a6c57d6 100644 --- a/packages/muelu/test/interface/default/Output/onelevel_tpetra.gold +++ b/packages/muelu/test/interface/default/Output/onelevel_tpetra.gold @@ -4,6 +4,7 @@ Level 0 Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> timer for apply = 0 + relaxation: type = Jacobi -------------------------------------------------------------------------------- --- Multigrid Summary --- diff --git a/packages/muelu/test/interface/default/Output/smoother10_tpetra.gold b/packages/muelu/test/interface/default/Output/smoother10_tpetra.gold index c45e53ce038b..d86d4b594e38 100644 
--- a/packages/muelu/test/interface/default/Output/smoother10_tpetra.gold +++ b/packages/muelu/test/interface/default/Output/smoother10_tpetra.gold @@ -5,6 +5,7 @@ Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 3 timer for apply = 0 + relaxation: type = Jacobi Level 1 Prolongator smoothing (MueLu::SaPFactory) @@ -39,6 +40,7 @@ Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 3 timer for apply = 0 + relaxation: type = Jacobi Level 2 Prolongator smoothing (MueLu::SaPFactory) diff --git a/packages/muelu/test/interface/default/Output/smoother11_tpetra.gold b/packages/muelu/test/interface/default/Output/smoother11_tpetra.gold index 411351fa09ee..89b2f9f61bcb 100644 --- a/packages/muelu/test/interface/default/Output/smoother11_tpetra.gold +++ b/packages/muelu/test/interface/default/Output/smoother11_tpetra.gold @@ -5,10 +5,12 @@ Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 3 timer for apply = 0 + relaxation: type = Jacobi Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 0 timer for apply = 0 + relaxation: type = Jacobi Level 1 Prolongator smoothing (MueLu::SaPFactory) @@ -43,10 +45,12 @@ Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 3 timer for apply = 0 + relaxation: type = Jacobi Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 0 timer for apply = 0 + relaxation: type = Jacobi Level 2 Prolongator smoothing (MueLu::SaPFactory) diff --git a/packages/muelu/test/interface/kokkos/Output/onelevel_tpetra.gold b/packages/muelu/test/interface/kokkos/Output/onelevel_tpetra.gold index 54222c686b67..f2622a6c57d6 100644 --- a/packages/muelu/test/interface/kokkos/Output/onelevel_tpetra.gold +++ b/packages/muelu/test/interface/kokkos/Output/onelevel_tpetra.gold @@ -4,6 +4,7 @@ Level 0 Setup Smoother 
(MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> timer for apply = 0 + relaxation: type = Jacobi -------------------------------------------------------------------------------- --- Multigrid Summary --- diff --git a/packages/muelu/test/interface/kokkos/Output/smoother10_tpetra.gold b/packages/muelu/test/interface/kokkos/Output/smoother10_tpetra.gold index 6aa7782d703e..454794b53db2 100644 --- a/packages/muelu/test/interface/kokkos/Output/smoother10_tpetra.gold +++ b/packages/muelu/test/interface/kokkos/Output/smoother10_tpetra.gold @@ -5,6 +5,7 @@ Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 3 timer for apply = 0 + relaxation: type = Jacobi Level 1 Prolongator smoothing (MueLu::SaPFactory) @@ -36,6 +37,7 @@ Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 3 timer for apply = 0 + relaxation: type = Jacobi Level 2 Prolongator smoothing (MueLu::SaPFactory) diff --git a/packages/muelu/test/interface/kokkos/Output/smoother11_tpetra.gold b/packages/muelu/test/interface/kokkos/Output/smoother11_tpetra.gold index 311d5e8dd74a..23fd32664480 100644 --- a/packages/muelu/test/interface/kokkos/Output/smoother11_tpetra.gold +++ b/packages/muelu/test/interface/kokkos/Output/smoother11_tpetra.gold @@ -5,10 +5,12 @@ Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 3 timer for apply = 0 + relaxation: type = Jacobi Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 0 timer for apply = 0 + relaxation: type = Jacobi Level 1 Prolongator smoothing (MueLu::SaPFactory) @@ -40,10 +42,12 @@ Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 3 timer for apply = 0 + relaxation: type = Jacobi Setup Smoother (MueLu::Ifpack2Smoother{type = RELAXATION}) smoother -> relaxation: sweeps = 0 timer for apply = 0 + relaxation: type = Jacobi Level 2 Prolongator smoothing 
(MueLu::SaPFactory) diff --git a/packages/muelu/test/paramlist/paramlistAdv.cpp b/packages/muelu/test/paramlist/paramlistAdv.cpp index c1b06cdc4afe..d1f0272fde15 100644 --- a/packages/muelu/test/paramlist/paramlistAdv.cpp +++ b/packages/muelu/test/paramlist/paramlistAdv.cpp @@ -54,8 +54,8 @@ class MyFactory : public ParameterListAcceptorAdvImpl { RCP GetValidParameterListSimple() const { RCP validParamList = Teuchos::rcp(new ParameterList()); // output list - typedef Teuchos::StringToIntegralParameterEntryValidator validator_type; - validParamList->set("Solver", "ILUT", "The type of solver to use.", Teuchos::rcp(new validator_type(Teuchos::tuple("ILUT", "ILUK"), "Solver"))); + typedef Teuchos::StringValidator validator_type; + validParamList->set("Solver", "ILUT", "The type of solver to use.", Teuchos::rcp(new validator_type(Teuchos::tuple("ILUT", "ILUK")))); AddILUTParameters(*validParamList); AddILUKParameters(*validParamList); diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml index d048ab0d38d8..dc62da3798e2 100644 --- a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml +++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs1.xml @@ -5,6 +5,8 @@ --> + + diff --git a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml index 5954af0d9af4..9a474311a667 100644 --- a/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml +++ b/packages/muelu/test/unit_tests/ParameterList/ParameterListInterpreter/BlockCrs2.xml @@ -5,6 +5,8 @@ --> + + diff --git a/packages/muelu/test/unit_tests/Regression.cpp b/packages/muelu/test/unit_tests/Regression.cpp index 3bb6e65b2a4f..115faf1b2f8a 100644 --- a/packages/muelu/test/unit_tests/Regression.cpp 
+++ b/packages/muelu/test/unit_tests/Regression.cpp @@ -105,9 +105,17 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Regression, H2D, Scalar, LocalOrdinal, GlobalO if (Node::is_cpu) { TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), 0); - } else { + } +#ifdef KOKKOS_HAS_SHARED_SPACE + else { + size_t targetNumDeepCopies = std::is_same_v ? 20 : 37; + TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), targetNumDeepCopies); + } +#else + else { TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), 37); } +#endif auto X = Xpetra::MultiVectorFactory::Build(A->getRowMap(), 1); auto B = Xpetra::MultiVectorFactory::Build(A->getRowMap(), 1); diff --git a/packages/muelu/test/unit_tests_kokkos/Regression.cpp b/packages/muelu/test/unit_tests_kokkos/Regression.cpp index 6c36aa96b1e8..d25a82599c5f 100644 --- a/packages/muelu/test/unit_tests_kokkos/Regression.cpp +++ b/packages/muelu/test/unit_tests_kokkos/Regression.cpp @@ -115,7 +115,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL(Regression, H2D, Scalar, LocalOrdinal, GlobalO } #ifdef KOKKOS_HAS_SHARED_SPACE else { - size_t targetNumDeepCopies = std::is_same_v ? 31 : 42; + size_t targetNumDeepCopies = std::is_same_v ? 
27 : 42; TEST_EQUALITY(Tpetra::Details::DeepCopyCounter::get_count_different_space(), targetNumDeepCopies); } #else diff --git a/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp b/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp index 884dbc23799b..4c8efea953e9 100644 --- a/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp +++ b/packages/nox/src-loca/src-tpetra/LOCA_BorderedSolver_TpetraHouseholder.cpp @@ -173,11 +173,8 @@ TpetraHouseholder(const Teuchos::RCP& global_data, validParams.set("Constraint Object",Teuchos::RCP(Teuchos::null)); validParams.set("Constraint Parameter Names",Teuchos::RCP>(Teuchos::null)); validParams.set("Scale Augmented Rows", true); - Teuchos::setStringToIntegralParameter("Preconditioner Method", - "Jacobian", - "Matrix to use for Preconditioning", - Teuchos::tuple ("Jacobian","SWM"), - &validParams); + validParams.set("Preconditioner Method", "Jacobian", "Matrix to use for Preconditioning", + rcp(new Teuchos::StringValidator(Teuchos::tuple("Jacobian","SWM")))); validParams.set("Include UV In Preconditioner", false); validParams.set("Use P For Preconditioner", false); solverParams->validateParametersAndSetDefaults(validParams); diff --git a/packages/nox/src-thyra/Thyra_NonlinearSolver_NOX.cpp b/packages/nox/src-thyra/Thyra_NonlinearSolver_NOX.cpp index 85517b980b85..6c43066c49cc 100644 --- a/packages/nox/src-thyra/Thyra_NonlinearSolver_NOX.cpp +++ b/packages/nox/src-thyra/Thyra_NonlinearSolver_NOX.cpp @@ -411,21 +411,17 @@ validateAndParseThyraGroupOptions(Teuchos::ParameterList& thyra_group_options_su ParameterList validParams; { - Teuchos::setStringToIntegralParameter( - "Function Scaling", - "None", + validParams.set( + "Function Scaling", "None", "Determines if function scaling of residual, Jacobian, etc. 
should be used.", - Teuchos::tuple("None","Row Sum", "User Defined"), - &validParams - ); + rcp(new Teuchos::StringValidator( + Teuchos::tuple("None", "Row Sum", "User Defined")))); - Teuchos::setStringToIntegralParameter( - "Update Row Sum Scaling", - "Before Each Nonlinear Solve", + validParams.set( + "Update Row Sum Scaling", "Before Each Nonlinear Solve", "Determines if function scaling of residual, Jacobian, etc. should be used.", - Teuchos::tuple("Before Each Nonlinear Solve","Before Each Nonlinear Iteration"), - &validParams - ); + rcp(new Teuchos::StringValidator( + Teuchos::tuple("Before Each Nonlinear Solve","Before Each Nonlinear Iteration")))); validParams.set > >("User Defined Scaling", Teuchos::null); validParams.set("Do Right Scaling First", false); diff --git a/packages/nox/src/CMakeLists.txt b/packages/nox/src/CMakeLists.txt index 2839d7a2b624..440fb359bf8d 100644 --- a/packages/nox/src/CMakeLists.txt +++ b/packages/nox/src/CMakeLists.txt @@ -26,6 +26,7 @@ TRIBITS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) APPEND_SET(HEADERS NOX.H NOX_Common.H + NOX_Exceptions.H ) # Abstract diff --git a/packages/panzer/adapters-stk/example/main_driver/user_app_NOXObserverFactory.hpp b/packages/panzer/adapters-stk/example/main_driver/user_app_NOXObserverFactory.hpp index 89672d9fc7bc..159ff1d5fb7c 100644 --- a/packages/panzer/adapters-stk/example/main_driver/user_app_NOXObserverFactory.hpp +++ b/packages/panzer/adapters-stk/example/main_driver/user_app_NOXObserverFactory.hpp @@ -123,20 +123,13 @@ namespace user_app { valid_params_ = Teuchos::rcp(new Teuchos::ParameterList); - Teuchos::setStringToIntegralParameter( - "Write Solution to Exodus File", - "ON", - "Enables or disables writing of solution to Exodus file at end of NOX solve", - Teuchos::tuple("ON","OFF"), - valid_params_.get() - ); - Teuchos::setStringToIntegralParameter( - "Neumann BC Analytic System Test", - "OFF", - "Checks solution values for Neumann BC Analytic System Test", - 
Teuchos::tuple("ON","OFF"), - valid_params_.get() - ); + valid_params_->set("Write Solution to Exodus File", "ON", + "Enables or disables writing of solution to Exodus file at end of NOX solve", + rcp(new Teuchos::StringValidator(Teuchos::tuple("ON", "OFF")))); + + valid_params_->set("Neumann BC Analytic System Test", "OFF", + "Checks solution values for Neumann BC Analytic System Test", + rcp(new Teuchos::StringValidator(Teuchos::tuple("ON", "OFF")))); } return valid_params_; diff --git a/packages/panzer/adapters-stk/src/Panzer_STK_ModelEvaluatorFactory_impl.hpp b/packages/panzer/adapters-stk/src/Panzer_STK_ModelEvaluatorFactory_impl.hpp index c5c85b9c71b2..5073e57808c1 100644 --- a/packages/panzer/adapters-stk/src/Panzer_STK_ModelEvaluatorFactory_impl.hpp +++ b/packages/panzer/adapters-stk/src/Panzer_STK_ModelEvaluatorFactory_impl.hpp @@ -1508,13 +1508,8 @@ namespace panzer_stk { { Teuchos::ParameterList validPL; { - Teuchos::setStringToIntegralParameter( - "Start Time Type", - "From Input File", - "Set the start time", - Teuchos::tuple("From Input File","From Exodus File"), - &validPL - ); + validPL.set("Start Time Type", "From Input File", "Set the start time", + rcp(new Teuchos::StringValidator(Teuchos::tuple("From Input File","From Exodus File")))); validPL.set("Start Time",0.0); } diff --git a/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_ExodusReaderFactory.cpp b/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_ExodusReaderFactory.cpp index f18daf0f0b73..18c23bbba1e7 100644 --- a/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_ExodusReaderFactory.cpp +++ b/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_ExodusReaderFactory.cpp @@ -404,16 +404,13 @@ Teuchos::RCP STK_ExodusReaderFactory::getValidPara validParams->set("Restart Index",-1,"Index of solution to read in", Teuchos::rcp(new 
Teuchos::AnyNumberParameterEntryValidator(Teuchos::AnyNumberParameterEntryValidator::PREFER_INT,Teuchos::AnyNumberParameterEntryValidator::AcceptedTypes(true)))); - Teuchos::setStringToIntegralParameter("File Type", - "Exodus", - "Choose input file type - either \"Exodus\", \"Exodus Refinement\" or \"Pamgen\"", - Teuchos::tuple("Exodus","Pamgen" + validParams->set("File Type", "Exodus", + "Choose input file type - either \"Exodus\", \"Exodus Refinement\" or \"Pamgen\"", + rcp(new Teuchos::StringValidator(Teuchos::tuple("Exodus", "Pamgen" #ifdef PANZER_HAVE_UMR - ,"Exodus Refinement" -#endif - ), - validParams.get() - ); + ,"Exodus Refinement" +#endif + )))); validParams->set("Scale Factor", 1.0, "Scale factor to apply to mesh after read", Teuchos::rcp(new Teuchos::AnyNumberParameterEntryValidator(Teuchos::AnyNumberParameterEntryValidator::PREFER_DOUBLE,Teuchos::AnyNumberParameterEntryValidator::AcceptedTypes(true)))); diff --git a/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_Quad8ToQuad4MeshFactory.cpp b/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_Quad8ToQuad4MeshFactory.cpp index 2f2d4763619f..960a2116aef2 100644 --- a/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_Quad8ToQuad4MeshFactory.cpp +++ b/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_Quad8ToQuad4MeshFactory.cpp @@ -184,12 +184,9 @@ Teuchos::RCP Quad8ToQuad4MeshFactory::getValidPara if(defaultParams == Teuchos::null) { defaultParams = rcp(new Teuchos::ParameterList); - Teuchos::setStringToIntegralParameter( - "Offset mesh GIDs above 32-bit int limit", - "OFF", + defaultParams->set("Offset mesh GIDs above 32-bit int limit", "OFF", "If 64-bit GIDs are supported, the mesh element and node global indices will start at a value greater than 32-bit limit.", - Teuchos::tuple("OFF", "ON"), - defaultParams.get()); + rcp(new Teuchos::StringValidator(Teuchos::tuple("OFF", "ON")))); // default to false for backward compatibility defaultParams->set("Create Edge 
Blocks",false,"Create edge blocks in the mesh"); diff --git a/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_QuadraticToLinearMeshFactory.cpp b/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_QuadraticToLinearMeshFactory.cpp index 39f0d1285369..c7ce0a1802ee 100644 --- a/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_QuadraticToLinearMeshFactory.cpp +++ b/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_QuadraticToLinearMeshFactory.cpp @@ -227,12 +227,9 @@ Teuchos::RCP QuadraticToLinearMeshFactory::getVali if(defaultParams == Teuchos::null) { defaultParams = rcp(new Teuchos::ParameterList); - Teuchos::setStringToIntegralParameter( - "Offset mesh GIDs above 32-bit int limit", - "OFF", + defaultParams->set("Offset mesh GIDs above 32-bit int limit", "OFF", "If 64-bit GIDs are supported, the mesh element and node global indices will start at a value greater than 32-bit limit.", - Teuchos::tuple("OFF", "ON"), - defaultParams.get()); + rcp(new Teuchos::StringValidator(Teuchos::tuple("OFF", "ON")))); // default to false for backward compatibility defaultParams->set("Create Edge Blocks",false,"Create edge blocks in the mesh"); diff --git a/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_SquareQuadMeshFactory.cpp b/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_SquareQuadMeshFactory.cpp index 6f5d3ec41b23..71e8302d6969 100644 --- a/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_SquareQuadMeshFactory.cpp +++ b/packages/panzer/adapters-stk/src/stk_interface/Panzer_STK_SquareQuadMeshFactory.cpp @@ -238,12 +238,9 @@ Teuchos::RCP SquareQuadMeshFactory::getValidParame // default to false for backward compatibility defaultParams->set("Create Edge Blocks",false,"Create edge blocks in the mesh"); - Teuchos::setStringToIntegralParameter( - "Offset mesh GIDs above 32-bit int limit", - "OFF", + defaultParams->set("Offset mesh GIDs above 32-bit int limit", "OFF", "If 64-bit GIDs are supported, the mesh element and 
node global indices will start at a value greater than 32-bit limit.", - Teuchos::tuple("OFF", "ON"), - defaultParams.get()); + rcp(new Teuchos::StringValidator(Teuchos::tuple("OFF", "ON")))); Teuchos::ParameterList & bcs = defaultParams->sublist("Periodic BCs"); bcs.set("Count",0); // no default periodic boundary conditions diff --git a/packages/panzer/disc-fe/src/Panzer_BasisValues2_impl.hpp b/packages/panzer/disc-fe/src/Panzer_BasisValues2_impl.hpp index b1f775e5ac47..d0fea45bb755 100644 --- a/packages/panzer/disc-fe/src/Panzer_BasisValues2_impl.hpp +++ b/packages/panzer/disc-fe/src/Panzer_BasisValues2_impl.hpp @@ -1082,7 +1082,6 @@ getBasisValues(const bool weighted, const int num_cells = num_cells_; const int num_points = basis_layout->numPoints(); const int num_card = basis_layout->cardinality(); - const int num_dim = basis_layout->dimension(); if(weighted){ TEUCHOS_ASSERT(cubature_weights_.size() > 0); @@ -1119,7 +1118,6 @@ getBasisValues(const bool weighted, TEUCHOS_ASSERT(cubature_jacobian_determinant_.size() > 0); } - auto cell_basis_ref_scalar = af.buildStaticArray("cell_basis_ref_scalar",num_card,num_points); auto tmp_basis_scalar = af.buildStaticArray("basis_scalar",num_cells,num_card,num_points); if(hasUniformReferenceSpace()){ @@ -1127,6 +1125,7 @@ getBasisValues(const bool weighted, auto cubature_points_uniform_ref = PHX::getNonConstDynRankViewFromConstMDField(cubature_points_uniform_ref_); // Apply a single reference representation to all cells + auto cell_basis_ref_scalar = af.buildStaticArray("cell_basis_ref_scalar",num_card,num_points); intrepid_basis->getValues(cell_basis_ref_scalar.get_view(),cubature_points_uniform_ref,Intrepid2::OPERATOR_VALUE); const std::pair cell_range(0,num_evaluate_cells_); @@ -1134,75 +1133,40 @@ getBasisValues(const bool weighted, // Apply transformation (HGRAD version is just a copy operation) using fst=Intrepid2::FunctionSpaceTools; - if(element_space == PureBasis::HVOL){ + if (element_space == PureBasis::HVOL){ 
auto s_cjd = Kokkos::subview(cubature_jacobian_determinant_.get_view(), cell_range, Kokkos::ALL()); fst::HVOLtransformVALUE(s_aux,s_cjd,cell_basis_ref_scalar.get_view()); - }else if(element_space == PureBasis::HGRAD || element_space == PureBasis::CONST) + } else if (element_space == PureBasis::HGRAD || element_space == PureBasis::CONST) { fst::HGRADtransformVALUE(s_aux,cell_basis_ref_scalar.get_view()); - + } PHX::Device().fence(); } else { - - // This is ugly. The algorithm is restricted to host/serial due - // to a call to intrepid tools that require uniform reference - // representation. For DG, CVFEM and sidesets this reference is - // nonuniform. - - // Local allocation used for each cell - auto cell_basis_scalar = af.buildStaticArray("cell_basis_scalar",1,num_card,num_points); - auto cell_cub_points = af.buildStaticArray("cell_cub_points",num_points,num_dim); - auto cell_jac_det = af.buildStaticArray("cell_jac_det",1,num_points); - - // The array factory is difficult to extend to host space - // without extra template magic and changing a ton of code in a - // non-backwards compatible way, so we use some of the arrays - // above only to get derivative array sized correctly and then - // create the mirror on host. - auto cell_basis_scalar_host = Kokkos::create_mirror_view(cell_basis_scalar.get_view()); - auto cell_cub_points_host = Kokkos::create_mirror_view(cell_cub_points.get_view()); - auto cell_jac_det_host = Kokkos::create_mirror_view(cell_jac_det.get_view()); - auto cell_basis_ref_scalar_host = Kokkos::create_mirror_view(cell_basis_ref_scalar.get_view()); + // getValues currently assumes a single reference cell. Calling + // it serially on host until the function supports multiple + // reference cells to avoid a kernel launch per cell. 
auto cubature_points_ref_host = Kokkos::create_mirror_view(cubature_points_ref_.get_view()); Kokkos::deep_copy(cubature_points_ref_host,cubature_points_ref_.get_view()); - auto cubature_jacobian_determinant_host = Kokkos::create_mirror_view(cubature_jacobian_determinant_.get_view()); - Kokkos::deep_copy(cubature_jacobian_determinant_host,cubature_jacobian_determinant_.get_view()); auto tmp_basis_scalar_host = Kokkos::create_mirror_view(tmp_basis_scalar.get_view()); + auto intrepid_basis_host = intrepid_basis->getHostBasis(); - // We have to iterate through cells and apply a separate reference representation for each - for(int cell=0; cellgetValues(cell_basis_ref_scalar.get_view(),cell_cub_points.get_view(),Intrepid2::OPERATOR_VALUE); - Kokkos::deep_copy(cell_basis_ref_scalar_host,cell_basis_ref_scalar.get_view()); - - using fst=Intrepid2::FunctionSpaceTools; - - if(element_space == PureBasis::HVOL){ - // Need the jacobian determinant for HVOL - for(int p=0;pgetValues(my_cell_basis_host,my_cell_cub_points_ref_host); + } + auto tmp_basis_scalar_ref = af.buildStaticArray("tmp_basis_scalar_ref",num_cells,num_card,num_points); + Kokkos::deep_copy(tmp_basis_scalar_ref.get_view(),tmp_basis_scalar_host); - Kokkos::deep_copy(tmp_basis_scalar.get_view(),tmp_basis_scalar_host); + using fst=Intrepid2::FunctionSpaceTools; + if(element_space == PureBasis::HVOL){ + const std::pair cell_range(0,num_evaluate_cells_); + auto s_cjd = Kokkos::subview(cubature_jacobian_determinant_.get_view(), cell_range, Kokkos::ALL()); + fst::HVOLtransformVALUE(tmp_basis_scalar.get_view(),s_cjd,tmp_basis_scalar_ref.get_view()); + } else if(element_space == PureBasis::HGRAD || element_space == PureBasis::CONST) { + fst::HGRADtransformVALUE(tmp_basis_scalar.get_view(),tmp_basis_scalar_ref.get_view()); } + PHX::Device().fence(); } // NOTE: weighted already has orientations applied, so this code @@ -1280,7 +1244,6 @@ getVectorBasisValues(const bool weighted, TEUCHOS_ASSERT(cubature_jacobian_.size() > 0 
&& cubature_jacobian_determinant_.size() > 0); } - auto cell_basis_ref_vector = af.buildStaticArray("cell_basis_ref_scalar",num_card,num_points,num_dim); auto tmp_basis_vector = af.buildStaticArray("basis_vector",num_cells,num_card,num_points,num_dim); if(hasUniformReferenceSpace()){ @@ -1288,6 +1251,7 @@ getVectorBasisValues(const bool weighted, auto cubature_points_uniform_ref = PHX::getNonConstDynRankViewFromConstMDField(cubature_points_uniform_ref_); // Apply a single reference representation to all cells + auto cell_basis_ref_vector = af.buildStaticArray("cell_basis_ref_scalar",num_card,num_points,num_dim); intrepid_basis->getValues(cell_basis_ref_vector.get_view(),cubature_points_uniform_ref,Intrepid2::OPERATOR_VALUE); const std::pair cell_range(0,num_evaluate_cells_); @@ -1296,102 +1260,47 @@ getVectorBasisValues(const bool weighted, // Apply transformation (HGRAD version is just a copy operation) using fst=Intrepid2::FunctionSpaceTools; if(element_space == PureBasis::HCURL){ - auto s_jac_inv = Kokkos::subview(cubature_jacobian_inverse_.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - fst::HCURLtransformVALUE(s_aux,s_jac_inv,cell_basis_ref_vector.get_view()); } else if(element_space == PureBasis::HDIV){ - auto s_jac = Kokkos::subview(cubature_jacobian_.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); auto s_jac_det = Kokkos::subview(cubature_jacobian_determinant_.get_view(), cell_range, Kokkos::ALL()); - fst::HDIVtransformVALUE(s_aux,s_jac, s_jac_det, cell_basis_ref_vector.get_view()); } - PHX::Device().fence(); } else { - // This is ugly. The algorithm is restricted to host/serial due - // to intrepid tools that requiring uniform reference - // representation. For DG, CVFEM and sidesets this reference is - // nonuniform. 
- - // Local allocation used for each cell - auto cell_basis_vector = af.buildStaticArray("cell_basis_vector",1,num_card,num_points,num_dim); - auto cell_cub_points = af.buildStaticArray("cell_cub_points",num_points,num_dim); - auto cell_jac_det = af.buildStaticArray("cell_jac_det",1,num_points); - auto cell_jac = af.buildStaticArray("cell_jac",1,num_points,num_dim,num_dim); - auto cell_jac_inv = af.buildStaticArray("cell_jac_inv",1,num_points,num_dim,num_dim); - - // The array factory is difficult to extend to host space - // without extra template magic and changing a ton of code in a - // non-backwards compatible way, so we use some of the arrays - // above only to get derivative array sized correctly and then - // create the mirror on host. - auto cell_basis_vector_host = Kokkos::create_mirror_view(cell_basis_vector.get_view()); - auto cell_cub_points_host = Kokkos::create_mirror_view(cell_cub_points.get_view()); - auto cell_jac_det_host = Kokkos::create_mirror_view(cell_jac_det.get_view()); - auto cell_jac_host = Kokkos::create_mirror_view(cell_jac.get_view()); - auto cell_jac_inv_host = Kokkos::create_mirror_view(cell_jac_inv.get_view()); - auto cell_basis_ref_vector_host = Kokkos::create_mirror_view(cell_basis_ref_vector.get_view()); + // getValues currently assumes a single reference cell. Calling + // it serially on host until the function supports multiple + // reference cells to avoid a kernel launch per cell. 
auto cubature_points_ref_host = Kokkos::create_mirror_view(cubature_points_ref_.get_view()); Kokkos::deep_copy(cubature_points_ref_host,cubature_points_ref_.get_view()); - auto cubature_jacobian_host = Kokkos::create_mirror_view(cubature_jacobian_.get_view()); - Kokkos::deep_copy(cubature_jacobian_host,cubature_jacobian_.get_view()); - auto cubature_jacobian_inverse_host = Kokkos::create_mirror_view(cubature_jacobian_inverse_.get_view()); - Kokkos::deep_copy(cubature_jacobian_inverse_host,cubature_jacobian_inverse_.get_view()); - auto cubature_jacobian_determinant_host = Kokkos::create_mirror_view(cubature_jacobian_determinant_.get_view()); - Kokkos::deep_copy(cubature_jacobian_determinant_host,cubature_jacobian_determinant_.get_view()); auto tmp_basis_vector_host = Kokkos::create_mirror_view(tmp_basis_vector.get_view()); - // We have to iterate through cells and apply a separate reference representation for each - for(int cell=0; cellgetValues(cell_basis_ref_vector.get_view(),cell_cub_points.get_view(),Intrepid2::OPERATOR_VALUE); - Kokkos::deep_copy(cell_basis_ref_vector_host,cell_basis_ref_vector.get_view()); - - using fst=Intrepid2::FunctionSpaceTools; - - if(element_space == PureBasis::HCURL){ - // Need the jacobian inverse for HCurl - for(int p=0;pgetHostBasis(); + for(int cell=0; cellgetValues(my_cell_basis_host,my_cell_cub_points_ref_host); } + auto tmp_basis_vector_ref = af.buildStaticArray("tmp_basis_vector_ref",num_cells,num_card,num_points,num_dim); + Kokkos::deep_copy(tmp_basis_vector_ref.get_view(),tmp_basis_vector_host); + + const std::pair cell_range(0,num_evaluate_cells_); + auto s_aux = Kokkos::subview(tmp_basis_vector.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + + using fst=Intrepid2::FunctionSpaceTools; + if(element_space == PureBasis::HCURL){ + auto s_jac_inv = Kokkos::subview(cubature_jacobian_inverse_.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + 
fst::HCURLtransformVALUE(s_aux,s_jac_inv,tmp_basis_vector_ref.get_view()); + } else if(element_space == PureBasis::HDIV){ + auto s_jac = Kokkos::subview(cubature_jacobian_.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto s_jac_det = Kokkos::subview(cubature_jacobian_determinant_.get_view(), cell_range, Kokkos::ALL()); + fst::HDIVtransformVALUE(s_aux,s_jac, s_jac_det, tmp_basis_vector_ref.get_view()); + } + PHX::Device().fence(); + } if(orientations_.size() > 0) @@ -1481,66 +1390,32 @@ getGradBasisValues(const bool weighted, } else { - // This is ugly. The algorithm is restricted to host/serial due - // to intrepid tools that requiring uniform reference - // representation. For DG, CVFEM and sidesets this reference is - // nonuniform. - - // Local allocation used for each cell - auto cell_grad_basis = af.buildStaticArray("cell_grad_basis",1,num_card,num_points,num_dim); - auto cell_cub_points = af.buildStaticArray("cell_cub_points",num_points,num_dim); - auto cell_jac_inv = af.buildStaticArray("cell_jac_inv",1,num_points,num_dim,num_dim); - - // The array factory is difficult to extend to host space - // without extra template magic and changing a ton of code in a - // non-backwards compatible way, so we use some of the arrays - // above only to get derivative array sized correctly and then - // create the mirror on host. - - // auto cell_basis_ref_vector = af.buildStaticArray("cell_basis_ref_scalar",num_card,num_points,num_dim); - - auto cell_cub_points_host = Kokkos::create_mirror_view(cell_cub_points.get_view()); + // getValues currently assumes a single reference cell. Calling + // it serially on host until the function supports multiple + // reference cells to avoid a kernel launch per cell. 
auto cubature_points_ref_host = Kokkos::create_mirror_view(cubature_points_ref_.get_view()); Kokkos::deep_copy(cubature_points_ref_host,cubature_points_ref_.get_view()); - auto cell_jac_inv_host = Kokkos::create_mirror_view(cell_jac_inv.get_view()); - auto cubature_jacobian_inverse_host = Kokkos::create_mirror_view(cubature_jacobian_inverse_.get_view()); - Kokkos::deep_copy(cubature_jacobian_inverse_host,cubature_jacobian_inverse_.get_view()); - auto cell_grad_basis_ref_host = Kokkos::create_mirror_view(cell_grad_basis_ref.get_view()); - auto cell_grad_basis_host = Kokkos::create_mirror_view(cell_grad_basis.get_view()); auto tmp_grad_basis_host = Kokkos::create_mirror_view(tmp_grad_basis.get_view()); - // We have to iterate through cells and apply a separate reference representation for each - for(int cell=0; cellgetValues(cell_grad_basis_ref.get_view(),cell_cub_points.get_view(),Intrepid2::OPERATOR_GRAD); - Kokkos::deep_copy(cell_grad_basis_ref_host,cell_grad_basis_ref.get_view()); - - for(int p=0;p; - fst::HGRADtransformGRAD(cell_grad_basis_host,cell_jac_inv_host,cell_grad_basis_ref_host); - // PHX::Device().fence(); - - // Copy cell quantity back into main array - for(int b=0; bgetHostBasis(); + for(int cell=0; cellgetValues(my_cell_grad_basis_host,my_cell_cub_points_ref_host,Intrepid2::OPERATOR_GRAD); } + auto tmp_grad_basis_ref = af.buildStaticArray("tmp_grad_basis_ref",num_cells,num_card,num_points,num_dim); + Kokkos::deep_copy(tmp_grad_basis_ref.get_view(),tmp_grad_basis_host); + + const std::pair cell_range(0,num_evaluate_cells_); + auto s_aux = Kokkos::subview(tmp_grad_basis.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto s_jac_inv = Kokkos::subview(cubature_jacobian_inverse_.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + + // Apply transformation + using fst=Intrepid2::FunctionSpaceTools; + fst::HGRADtransformGRAD(s_aux, s_jac_inv,tmp_grad_basis_ref.get_view()); + + PHX::Device().fence(); + } 
if(orientations_.size() > 0) @@ -1609,11 +1484,11 @@ getCurl2DVectorBasis(const bool weighted, const auto element_space = getElementSpace(); TEUCHOS_ASSERT(element_space == PureBasis::HCURL); - auto cell_curl_basis_ref_scalar = af.buildStaticArray("cell_curl_basis_ref_scalar",num_card,num_points); auto tmp_curl_basis_scalar = af.buildStaticArray("curl_basis_scalar",num_cells,num_card,num_points); if(hasUniformReferenceSpace()){ + auto cell_curl_basis_ref_scalar = af.buildStaticArray("cell_curl_basis_ref_scalar",num_card,num_points); auto cubature_points_uniform_ref = PHX::getNonConstDynRankViewFromConstMDField(cubature_points_uniform_ref_); intrepid_basis->getValues(cell_curl_basis_ref_scalar.get_view(),cubature_points_uniform_ref,Intrepid2::OPERATOR_CURL); @@ -1631,63 +1506,33 @@ getCurl2DVectorBasis(const bool weighted, } else { - // This is ugly. The algorithm is restricted to host/serial due - // to intrepid tools that requiring uniform reference - // representation. For DG, CVFEM and sidesets this reference is - // nonuniform. - - // Local allocation used for each cell - auto cell_curl = af.buildStaticArray("cell_curl",1,num_card,num_points); - auto cell_cub_points = af.buildStaticArray("cell_cub_points",num_points,num_dim); - auto cell_jac_det = af.buildStaticArray("cell_jac_det",1,num_points); - - // The array factory is difficult to extend to host space - // without extra template magic and changing a ton of code in a - // non-backwards compatible way, so we use some of the arrays - // above only to get derivative array sized correctly and then - // create the mirror on host. - auto cell_cub_points_host = Kokkos::create_mirror_view(cell_cub_points.get_view()); + // getValues currently assumes a single reference cell. Calling + // it serially on host until the function supports multiple + // reference cells to avoid a kernel launch per cell. 
auto cubature_points_ref_host = Kokkos::create_mirror_view(cubature_points_ref_.get_view()); Kokkos::deep_copy(cubature_points_ref_host,cubature_points_ref_.get_view()); - auto cell_jac_det_host = Kokkos::create_mirror_view(cell_jac_det.get_view()); - auto cubature_jacobian_determinant_host = Kokkos::create_mirror_view(cubature_jacobian_determinant_.get_view()); - Kokkos::deep_copy(cubature_jacobian_determinant_host,cubature_jacobian_determinant_.get_view()); - auto cell_curl_basis_ref_scalar_host = Kokkos::create_mirror_view(cell_curl_basis_ref_scalar.get_view()); - auto cell_curl_host = Kokkos::create_mirror_view(cell_curl.get_view()); auto tmp_curl_basis_scalar_host = Kokkos::create_mirror_view(tmp_curl_basis_scalar.get_view()); - // We have to iterate through cells and apply a separate reference representation for each - for(int cell=0; cellgetValues(cell_curl_basis_ref_scalar.get_view(),cell_cub_points.get_view(),Intrepid2::OPERATOR_CURL); - Kokkos::deep_copy(cell_curl_basis_ref_scalar_host,cell_curl_basis_ref_scalar.get_view()); - - // note only volume deformation is needed! - // this relates directly to this being in - // the divergence space in 2D! 
- using fst=Intrepid2::FunctionSpaceTools; - fst::HDIVtransformDIV(cell_curl_host,cell_jac_det_host,cell_curl_basis_ref_scalar_host); - PHX::Device().fence(); - - // Copy cell quantity back into main array - for(int b=0; bgetHostBasis(); + for(int cell=0; cellgetValues(my_cell_curl_basis_host,my_cell_cub_points_ref_host,Intrepid2::OPERATOR_CURL); } + auto tmp_curl_basis_scalar_ref = af.buildStaticArray("tmp_curl_basis_scalar_ref",num_cells,num_card,num_points); + Kokkos::deep_copy(tmp_curl_basis_scalar_ref.get_view(),tmp_curl_basis_scalar_host); + + const std::pair cell_range(0,num_evaluate_cells_); + auto s_aux = Kokkos::subview(tmp_curl_basis_scalar.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL()); + auto s_jac_det = Kokkos::subview(cubature_jacobian_determinant_.get_view(), cell_range, Kokkos::ALL()); + + // note only volume deformation is needed! + // this relates directly to this being in + // the divergence space in 2D! + using fst=Intrepid2::FunctionSpaceTools; + fst::HDIVtransformDIV(s_aux,s_jac_det,tmp_curl_basis_scalar_ref.get_view()); + PHX::Device().fence(); + } if(orientations_.size() > 0) @@ -1756,11 +1601,11 @@ getCurlVectorBasis(const bool weighted, const auto element_space = getElementSpace(); TEUCHOS_ASSERT(element_space == PureBasis::HCURL); - auto cell_curl_basis_ref_vector = af.buildStaticArray("cell_curl_basis_ref_vector",num_card,num_points,num_dim); auto tmp_curl_basis_vector = af.buildStaticArray("curl_basis_vector",num_cells,num_card,num_points,num_dim); if(hasUniformReferenceSpace()){ + auto cell_curl_basis_ref_vector = af.buildStaticArray("cell_curl_basis_ref_vector",num_card,num_points,num_dim); auto cubature_points_uniform_ref = PHX::getNonConstDynRankViewFromConstMDField(cubature_points_uniform_ref_); intrepid_basis->getValues(cell_curl_basis_ref_vector.get_view(),cubature_points_uniform_ref,Intrepid2::OPERATOR_CURL); @@ -1776,68 +1621,31 @@ getCurlVectorBasis(const bool weighted, } else { - // This is ugly. 
The algorithm is restricted to host/serial due - // to intrepid tools that requiring uniform reference - // representation. For DG, CVFEM and sidesets this reference is - // nonuniform. - - // Local allocation used for each cell - auto cell_curl = af.buildStaticArray("cell_curl",1,num_card,num_points,num_dim); - auto cell_cub_points = af.buildStaticArray("cell_cub_points",num_points,num_dim); - auto cell_jac = af.buildStaticArray("cell_jac",1,num_points,num_dim,num_dim); - auto cell_jac_det = af.buildStaticArray("cell_jac_det",1,num_points); - - // The array factory is difficult to extend to host space - // without extra template magic and changing a ton of code in a - // non-backwards compatible way, so we use some of the arrays - // above only to get derivative array sized correctly and then - // create the mirror on host. - auto cell_cub_points_host = Kokkos::create_mirror_view(cell_cub_points.get_view()); + // getValues currently assumes a single reference cell. Calling + // it serially on host until the function supports multiple + // reference cells to avoid a kernel launch per cell. 
auto cubature_points_ref_host = Kokkos::create_mirror_view(cubature_points_ref_.get_view()); Kokkos::deep_copy(cubature_points_ref_host,cubature_points_ref_.get_view()); - auto cell_jac_host = Kokkos::create_mirror_view(cell_jac.get_view()); - auto cubature_jacobian_host = Kokkos::create_mirror_view(cubature_jacobian_.get_view()); - Kokkos::deep_copy(cubature_jacobian_host,cubature_jacobian_.get_view()); - auto cell_jac_det_host = Kokkos::create_mirror_view(cell_jac_det.get_view()); - auto cubature_jacobian_determinant_host = Kokkos::create_mirror_view(cubature_jacobian_determinant_.get_view()); - Kokkos::deep_copy(cubature_jacobian_determinant_host,cubature_jacobian_determinant_.get_view()); - auto cell_curl_basis_ref_vector_host = Kokkos::create_mirror_view(cell_curl_basis_ref_vector.get_view()); - auto cell_curl_host = Kokkos::create_mirror_view(cell_curl.get_view()); auto tmp_curl_basis_vector_host = Kokkos::create_mirror_view(tmp_curl_basis_vector.get_view()); - // We have to iterate through cells and apply a separate reference representation for each - for(int cell=0; cellgetValues(cell_curl_basis_ref_vector.get_view(),cell_cub_points.get_view(),Intrepid2::OPERATOR_CURL); - Kokkos::deep_copy(cell_curl_basis_ref_vector_host,cell_curl_basis_ref_vector.get_view()); - - using fst=Intrepid2::FunctionSpaceTools; - fst::HCURLtransformCURL(cell_curl_host,cell_jac_host,cell_jac_det_host,cell_curl_basis_ref_vector_host); - // PHX::Device().fence(); - - // Copy cell quantity back into main array - for(int b=0; bgetHostBasis(); + for(int cell=0; cellgetValues(my_cell_curl_basis_host,my_cell_cub_points_ref_host,Intrepid2::OPERATOR_CURL); } + auto tmp_curl_basis_vector_ref = af.buildStaticArray("tmp_curl_basis_scalar_ref",num_cells,num_card,num_points,num_dim); + Kokkos::deep_copy(tmp_curl_basis_vector_ref.get_view(),tmp_curl_basis_vector_host); + + const std::pair cell_range(0,num_evaluate_cells_); + auto s_aux = Kokkos::subview(tmp_curl_basis_vector.get_view(), 
cell_range, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto s_jac = Kokkos::subview(cubature_jacobian_.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto s_jac_det = Kokkos::subview(cubature_jacobian_determinant_.get_view(), cell_range, Kokkos::ALL()); + + using fst=Intrepid2::FunctionSpaceTools; + fst::HCURLtransformCURL(s_aux, s_jac, s_jac_det, tmp_curl_basis_vector_ref.get_view()); + PHX::Device().fence(); + } if(orientations_.size() > 0) @@ -1873,7 +1681,6 @@ getDivVectorBasis(const bool weighted, const int num_cells = num_cells_; const int num_points = basis_layout->numPoints(); const int num_card = basis_layout->cardinality(); - const int num_dim = basis_layout->dimension(); if(weighted){ @@ -1905,11 +1712,11 @@ getDivVectorBasis(const bool weighted, const auto element_space = getElementSpace(); TEUCHOS_ASSERT(element_space == PureBasis::HDIV); - auto cell_div_basis_ref = af.buildStaticArray("cell_div_basis_ref",num_card,num_points); auto tmp_div_basis = af.buildStaticArray("div_basis",num_cells,num_card,num_points); if(hasUniformReferenceSpace()){ + auto cell_div_basis_ref = af.buildStaticArray("cell_div_basis_ref",num_card,num_points); auto cubature_points_uniform_ref = PHX::getNonConstDynRankViewFromConstMDField(cubature_points_uniform_ref_); intrepid_basis->getValues(cell_div_basis_ref.get_view(),cubature_points_uniform_ref,Intrepid2::OPERATOR_DIV); @@ -1924,60 +1731,30 @@ getDivVectorBasis(const bool weighted, } else { - // This is ugly. The algorithm is restricted to host/serial due - // to intrepid tools that requiring uniform reference - // representation. For DG, CVFEM and sidesets this reference is - // nonuniform. 
- - // Local allocation used for each cell - auto cell_div_basis = af.buildStaticArray("cell_div_basis",1,num_card,num_points); - auto cell_cub_points = af.buildStaticArray("cell_cub_points",num_points,num_dim); - auto cell_jac_det = af.buildStaticArray("cell_jac_det",1,num_points); - - // The array factory is difficult to extend to host space - // without extra template magic and changing a ton of code in a - // non-backwards compatible way, so we use some of the arrays - // above only to get derivative array sized correctly and then - // create the mirror on host. - auto cell_cub_points_host = Kokkos::create_mirror_view(cell_cub_points.get_view()); + // getValues currently assumes a single reference cell. Calling + // it serially on host until the function supports multiple + // reference cells to avoid a kernel launch per cell. auto cubature_points_ref_host = Kokkos::create_mirror_view(cubature_points_ref_.get_view()); Kokkos::deep_copy(cubature_points_ref_host,cubature_points_ref_.get_view()); - auto cell_jac_det_host = Kokkos::create_mirror_view(cell_jac_det.get_view()); - auto cubature_jacobian_determinant_host = Kokkos::create_mirror_view(cubature_jacobian_determinant_.get_view()); - Kokkos::deep_copy(cubature_jacobian_determinant_host,cubature_jacobian_determinant_.get_view()); - auto cell_div_basis_ref_host = Kokkos::create_mirror_view(cell_div_basis_ref.get_view()); - auto cell_div_basis_host = Kokkos::create_mirror_view(cell_div_basis.get_view()); auto tmp_div_basis_host = Kokkos::create_mirror_view(tmp_div_basis.get_view()); - // We have to iterate through cells and apply a separate reference representation for each - for(int cell=0; cellgetValues(cell_div_basis_ref.get_view(),cell_cub_points.get_view(),Intrepid2::OPERATOR_DIV); - Kokkos::deep_copy(cell_div_basis_ref_host,cell_div_basis_ref.get_view()); - - using fst=Intrepid2::FunctionSpaceTools; - fst::HDIVtransformDIV(cell_div_basis.get_view(),cell_jac_det.get_view(),cell_div_basis_ref.get_view()); - 
Kokkos::deep_copy(cell_div_basis_host, cell_div_basis.get_static_view()); - - - // Copy cell quantity back into main array - for(int b=0; bgetHostBasis(); + for(int cell=0; cellgetValues(my_cell_div_basis_host,my_cell_cub_points_ref_host,Intrepid2::OPERATOR_DIV); } + auto tmp_div_basis_ref = af.buildStaticArray("tmp_div_basis_ref",num_cells,num_card,num_points); + Kokkos::deep_copy(tmp_div_basis_ref.get_view(),tmp_div_basis_host); + + const std::pair cell_range(0,num_evaluate_cells_); + auto s_aux = Kokkos::subview(tmp_div_basis.get_view(), cell_range, Kokkos::ALL(), Kokkos::ALL()); + auto s_jac_det = Kokkos::subview(cubature_jacobian_determinant_.get_view(), cell_range, Kokkos::ALL()); + + using fst=Intrepid2::FunctionSpaceTools; + fst::HDIVtransformDIV(s_aux,s_jac_det,tmp_div_basis_ref.get_view()); + PHX::Device().fence(); + } if(orientations_.size() > 0) diff --git a/packages/panzer/disc-fe/test/equation_set/user_app_EquationSet_Energy_impl.hpp b/packages/panzer/disc-fe/test/equation_set/user_app_EquationSet_Energy_impl.hpp index b583e9896180..6e36984c9292 100644 --- a/packages/panzer/disc-fe/test/equation_set/user_app_EquationSet_Energy_impl.hpp +++ b/packages/panzer/disc-fe/test/equation_set/user_app_EquationSet_Energy_impl.hpp @@ -84,14 +84,9 @@ EquationSet_Energy(const Teuchos::RCP& params, valid_parameters.set("Basis Type","HGrad","Type of Basis to use"); valid_parameters.set("Basis Order",1,"Order of the basis"); valid_parameters.set("Integration Order",-1,"Order of the integration rule"); - - Teuchos::setStringToIntegralParameter( - "CONVECTION", - "OFF", + valid_parameters.set("CONVECTION", "OFF", "Enables or disables convection term in the energy equation", - Teuchos::tuple("ON","OFF"), - &valid_parameters - ); + rcp(new Teuchos::StringValidator(Teuchos::tuple("ON", "OFF")))); params->validateParametersAndSetDefaults(valid_parameters); } diff --git a/packages/seacas/libraries/exodus/src/ex_put_concat_elem_block.c 
b/packages/seacas/libraries/exodus/src/ex_put_concat_elem_block.c index 0d839323d52b..f5c7ad17bfc4 100644 --- a/packages/seacas/libraries/exodus/src/ex_put_concat_elem_block.c +++ b/packages/seacas/libraries/exodus/src/ex_put_concat_elem_block.c @@ -1,5 +1,5 @@ /* - * Copyright(C) 1999-2020, 2022, 2023 National Technology & Engineering Solutions + * Copyright(C) 1999-2020, 2022, 2023, 2024 National Technology & Engineering Solutions * of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with * NTESS, the U.S. Government retains certain rights in this software. * @@ -271,8 +271,10 @@ int ex_put_concat_elem_block(int exoid, const void_int *elem_blk_id, char *const ex_err_fn(exoid, __func__, errmsg, status); goto error_ret; /* exit define mode and return */ } +#if defined(EX_CAN_USE_NC_DEF_VAR_FILL) int fill = NC_FILL_CHAR; nc_def_var_fill(exoid, temp, 0, &fill); +#endif eb_array[iblk] = temp; dims[0] = numelbdim; diff --git a/packages/seacas/libraries/ioss/cmake/Dependencies.cmake b/packages/seacas/libraries/ioss/cmake/Dependencies.cmake index 1349df1d8879..f16f73919038 100644 --- a/packages/seacas/libraries/ioss/cmake/Dependencies.cmake +++ b/packages/seacas/libraries/ioss/cmake/Dependencies.cmake @@ -1,14 +1,15 @@ -if(CMAKE_PROJECT_NAME STREQUAL "Trilinos") -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_PACKAGES SEACASExodus Pamgen Zoltan Kokkos - LIB_OPTIONAL_TPLS HDF5 CGNS ParMETIS Faodel Cereal DLlib Pthread DataWarp ADIOS2 Catalyst2 ${SEACAS_GTest_TPL_name} -) -else() +if(CMAKE_PROJECT_NAME STREQUAL "Seacas" OR CMAKE_PROJECT_NAME STREQUAL "SEACAS" ) TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_TPLS fmt LIB_OPTIONAL_PACKAGES SEACASExodus Zoltan LIB_OPTIONAL_TPLS HDF5 Pamgen CGNS ParMETIS Faodel Cereal DLlib Pthread ADIOS2 Catalyst2 ${SEACAS_GTest_TPL_name} Kokkos DataWarp Catch2 ) +else() +# Typically for Trilinos since don't have fmt as TPL, but instead have embedded versions. 
+TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( + LIB_OPTIONAL_PACKAGES SEACASExodus Pamgen Zoltan Kokkos + LIB_OPTIONAL_TPLS HDF5 CGNS ParMETIS Faodel Cereal DLlib Pthread DataWarp ADIOS2 Catalyst2 ${SEACAS_GTest_TPL_name} +) endif() TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) diff --git a/packages/seacas/libraries/ioss/src/Ioss_Region.h b/packages/seacas/libraries/ioss/src/Ioss_Region.h index b6aeb52fd155..689584d5c234 100644 --- a/packages/seacas/libraries/ioss/src/Ioss_Region.h +++ b/packages/seacas/libraries/ioss/src/Ioss_Region.h @@ -22,9 +22,6 @@ #include "Ioss_Utils.h" #include "Ioss_VariableType.h" #include "ioss_export.h" -#if !defined BUILT_IN_SIERRA -#include -#endif #include // for less #include // for ostream #include // for map, map<>::value_compare @@ -435,17 +432,9 @@ namespace Ioss { if (found && field.get_role() != role) { std::ostringstream errmsg; -#if defined BUILT_IN_SIERRA errmsg << "ERROR: Field " << field.get_name() << " with role " << field.role_string() << " on entity " << entity->name() << " does not match previously found role " << Ioss::Field::role_string(role) << ".\n", -#else - fmt::print(errmsg, - "ERROR: Field {} with role {} on entity {} does not match previously found " - "role {}.\n", - field.get_name(), field.role_string(), entity->name(), - Ioss::Field::role_string(role)); -#endif IOSS_ERROR(errmsg); } diff --git a/packages/seacas/libraries/ioss/src/Ioss_StructuredBlock.h b/packages/seacas/libraries/ioss/src/Ioss_StructuredBlock.h index ece3ea2b6a83..640e37b8bd45 100644 --- a/packages/seacas/libraries/ioss/src/Ioss_StructuredBlock.h +++ b/packages/seacas/libraries/ioss/src/Ioss_StructuredBlock.h @@ -14,8 +14,6 @@ #include "Ioss_ZoneConnectivity.h" #include #include -#include -#include #include #include #include @@ -370,10 +368,3 @@ namespace Ioss { }; } // namespace Ioss -#if FMT_VERSION >= 90000 -namespace fmt { - template <> struct formatter : ostream_formatter - { - }; -} // namespace fmt -#endif diff --git 
a/packages/seacas/libraries/ioss/src/Ioss_ZoneConnectivity.h b/packages/seacas/libraries/ioss/src/Ioss_ZoneConnectivity.h index 9927e1b3d93a..dda947eb6ee9 100644 --- a/packages/seacas/libraries/ioss/src/Ioss_ZoneConnectivity.h +++ b/packages/seacas/libraries/ioss/src/Ioss_ZoneConnectivity.h @@ -10,8 +10,6 @@ #include #include #include -#include -#include #include #include #include @@ -153,11 +151,3 @@ namespace Ioss { IOSS_EXPORT std::ostream &operator<<(std::ostream &os, const ZoneConnectivity &zgc); } // namespace Ioss - -#if FMT_VERSION >= 90000 -namespace fmt { - template <> struct formatter : ostream_formatter - { - }; -} // namespace fmt -#endif diff --git a/packages/seacas/libraries/ioss/src/main/io_info.C b/packages/seacas/libraries/ioss/src/main/io_info.C index 706b1db90cd9..2d0552882270 100644 --- a/packages/seacas/libraries/ioss/src/main/io_info.C +++ b/packages/seacas/libraries/ioss/src/main/io_info.C @@ -1,4 +1,4 @@ -// Copyright(C) 1999-2023 National Technology & Engineering Solutions +// Copyright(C) 1999-2024 National Technology & Engineering Solutions // of Sandia, LLC (NTESS). Under the terms of Contract DE-NA0003525 with // NTESS, the U.S. Government retains certain rights in this software. 
// @@ -305,11 +305,7 @@ namespace { if (!sb->m_zoneConnectivity.empty()) { fmt::print("\tConnectivity with other blocks:\n"); for (const auto &zgc : sb->m_zoneConnectivity) { -#if defined __NVCC__ std::cout << zgc << "\n"; -#else - fmt::print("{}\n", zgc); -#endif } } if (!sb->m_boundaryConditions.empty()) { @@ -323,11 +319,7 @@ namespace { }); for (const auto &bc : sb_bc) { -#if defined __NVCC__ std::cout << bc << "\n"; -#else - fmt::print("{}\n", bc); -#endif } } if (interFace.compute_bbox()) { diff --git a/packages/stokhos/test/UnitTest/Stokhos_KokkosArrayKernelsUnitTest_OpenMP.cpp b/packages/stokhos/test/UnitTest/Stokhos_KokkosArrayKernelsUnitTest_OpenMP.cpp index 10dcc962727d..1358a0305a9d 100644 --- a/packages/stokhos/test/UnitTest/Stokhos_KokkosArrayKernelsUnitTest_OpenMP.cpp +++ b/packages/stokhos/test/UnitTest/Stokhos_KokkosArrayKernelsUnitTest_OpenMP.cpp @@ -81,19 +81,11 @@ TEUCHOS_UNIT_TEST( Kokkos_SG_SpMv, double_OpenMP_CrsMatrixFree_MKL ) { #endif int main( int argc, char* argv[] ) { + // Setup the MPI session Teuchos::GlobalMPISession mpiSession(&argc, &argv); - const size_t team_count = - Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); - const size_t threads_per_team = - Kokkos::hwloc::get_available_threads_per_core(); - - // Initialize openmp - Kokkos::InitializationSettings init_args; - init_args.set_num_threads(team_count*threads_per_team); - Kokkos::initialize( init_args ); - //Kokkos::print_configuration( std::cout ); + // Initialize Kokkos + Kokkos::initialize(argc, argv); // Setup (has to happen after initialization) setup.setup(); diff --git a/packages/stokhos/test/UnitTest/Stokhos_KokkosCrsMatrixMPVectorUnitTest_OpenMP.cpp b/packages/stokhos/test/UnitTest/Stokhos_KokkosCrsMatrixMPVectorUnitTest_OpenMP.cpp index f7e7f226bd51..90744f4eb615 100644 --- a/packages/stokhos/test/UnitTest/Stokhos_KokkosCrsMatrixMPVectorUnitTest_OpenMP.cpp +++ 
b/packages/stokhos/test/UnitTest/Stokhos_KokkosCrsMatrixMPVectorUnitTest_OpenMP.cpp @@ -114,19 +114,11 @@ TEUCHOS_UNIT_TEST_TEMPLATE_2_DECL( CRS_MATRIX_MP_VECTOR_MULTIPLY_TESTS_ORDINAL_SCALAR_DEVICE(int, double, OpenMP) int main( int argc, char* argv[] ) { + // Setup the MPI session Teuchos::GlobalMPISession mpiSession(&argc, &argv); - // Initialize threads - num_cores = - Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); - num_hyper_threads = - Kokkos::hwloc::get_available_threads_per_core(); - - Kokkos::InitializationSettings init_args; - init_args.set_num_threads(num_cores*num_hyper_threads); - Kokkos::initialize( init_args ); - //Kokkos::print_configuration(std::cout); + // Initialize Kokkos + Kokkos::initialize(argc, argv); // Run tests int ret = Teuchos::UnitTestRepository::runUnitTestsFromMain(argc, argv); diff --git a/packages/stokhos/test/UnitTest/Stokhos_KokkosCrsMatrixUQPCEUnitTest_OpenMP.cpp b/packages/stokhos/test/UnitTest/Stokhos_KokkosCrsMatrixUQPCEUnitTest_OpenMP.cpp index 28a15d2e9cde..6cd991ee4c25 100644 --- a/packages/stokhos/test/UnitTest/Stokhos_KokkosCrsMatrixUQPCEUnitTest_OpenMP.cpp +++ b/packages/stokhos/test/UnitTest/Stokhos_KokkosCrsMatrixUQPCEUnitTest_OpenMP.cpp @@ -52,19 +52,11 @@ using Kokkos::OpenMP; CRSMATRIX_UQ_PCE_TESTS_DEVICE( OpenMP ) int main( int argc, char* argv[] ) { + // Setup the MPI session Teuchos::GlobalMPISession mpiSession(&argc, &argv); - // Initialize threads - const size_t num_cores = - Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); - const size_t num_hyper_threads = - Kokkos::hwloc::get_available_threads_per_core(); - - Kokkos::InitializationSettings init_args; - init_args.set_num_threads(num_cores*num_hyper_threads); - Kokkos::initialize( init_args ); - Kokkos::print_configuration(std::cout); + // Initialize Kokkos + Kokkos::initialize(argc, argv); // Run tests int ret = Teuchos::UnitTestRepository::runUnitTestsFromMain(argc, 
argv); diff --git a/packages/stokhos/test/UnitTest/Stokhos_KokkosViewFadMPVectorUnitTest_OpenMP.cpp b/packages/stokhos/test/UnitTest/Stokhos_KokkosViewFadMPVectorUnitTest_OpenMP.cpp index 101d6965dc1e..b629774c026d 100644 --- a/packages/stokhos/test/UnitTest/Stokhos_KokkosViewFadMPVectorUnitTest_OpenMP.cpp +++ b/packages/stokhos/test/UnitTest/Stokhos_KokkosViewFadMPVectorUnitTest_OpenMP.cpp @@ -52,19 +52,11 @@ using Kokkos::OpenMP; VIEW_FAD_MP_VECTOR_TESTS_DEVICE( OpenMP ) int main( int argc, char* argv[] ) { +// Setup the MPI session Teuchos::GlobalMPISession mpiSession(&argc, &argv); - // Initialize threads - size_t num_cores = - Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); - size_t num_hyper_threads = - Kokkos::hwloc::get_available_threads_per_core(); - - Kokkos::InitializationSettings init_args; - init_args.set_num_threads(num_cores*num_hyper_threads); - Kokkos::initialize( init_args ); - //Kokkos::print_configuration(std::cout); + // Initialize Kokkos + Kokkos::initialize(argc, argv); // Run tests int ret = Teuchos::UnitTestRepository::runUnitTestsFromMain(argc, argv); diff --git a/packages/stokhos/test/UnitTest/Stokhos_KokkosViewMPVectorUnitTest_OpenMP.cpp b/packages/stokhos/test/UnitTest/Stokhos_KokkosViewMPVectorUnitTest_OpenMP.cpp index d0cce43ede2d..c206ec7dc18c 100644 --- a/packages/stokhos/test/UnitTest/Stokhos_KokkosViewMPVectorUnitTest_OpenMP.cpp +++ b/packages/stokhos/test/UnitTest/Stokhos_KokkosViewMPVectorUnitTest_OpenMP.cpp @@ -52,19 +52,11 @@ using Kokkos::OpenMP; VIEW_MP_VECTOR_TESTS_DEVICE( OpenMP ) int main( int argc, char* argv[] ) { + // Setup the MPI session Teuchos::GlobalMPISession mpiSession(&argc, &argv); - // Initialize threads - size_t num_cores = - Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); - size_t num_hyper_threads = - Kokkos::hwloc::get_available_threads_per_core(); - - Kokkos::InitializationSettings init_args; - 
init_args.set_num_threads(num_cores*num_hyper_threads); - Kokkos::initialize( init_args ); - //Kokkos::print_configuration(std::cout); + // Initialize Kokkos + Kokkos::initialize(argc, argv); // Run tests int ret = Teuchos::UnitTestRepository::runUnitTestsFromMain(argc, argv); diff --git a/packages/stokhos/test/UnitTest/Stokhos_KokkosViewUQPCEUnitTest_OpenMP.cpp b/packages/stokhos/test/UnitTest/Stokhos_KokkosViewUQPCEUnitTest_OpenMP.cpp index 50e00e086f0e..cababd922cba 100644 --- a/packages/stokhos/test/UnitTest/Stokhos_KokkosViewUQPCEUnitTest_OpenMP.cpp +++ b/packages/stokhos/test/UnitTest/Stokhos_KokkosViewUQPCEUnitTest_OpenMP.cpp @@ -52,21 +52,11 @@ using Kokkos::OpenMP; VIEW_UQ_PCE_TESTS_DEVICE( OpenMP ) int main( int argc, char* argv[] ) { +// Setup the MPI session Teuchos::GlobalMPISession mpiSession(&argc, &argv); - // Initialize threads - const size_t num_cores = - Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); - const size_t num_hyper_threads = - Kokkos::hwloc::get_available_threads_per_core(); - // const size_t num_cores = 1; - // const size_t num_hyper_threads = 1; - - Kokkos::InitializationSettings init_args; - init_args.set_num_threads(num_cores*num_hyper_threads); - Kokkos::initialize( init_args ); - //Kokkos::print_configuration(std::cout); + // Initialize Kokkos + Kokkos::initialize(argc, argv); // Run tests int ret = Teuchos::UnitTestRepository::runUnitTestsFromMain(argc, argv); diff --git a/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest_OpenMP.cpp b/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest_OpenMP.cpp index 830e2615cfea..d274c3b0770c 100644 --- a/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest_OpenMP.cpp +++ b/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixMPVectorUnitTest_OpenMP.cpp @@ -53,22 +53,13 @@ typedef Tpetra::KokkosCompat::KokkosDeviceWrapperNode OpenMPWrap CRSMATRIX_MP_VECTOR_TESTS_N( OpenMPWrapperNode ) int 
main( int argc, char* argv[] ) { + // Setup the MPI session Teuchos::GlobalMPISession mpiSession(&argc, &argv); Kokkos::global_sacado_mp_vector_size = VectorSize; - // Initialize threads - const size_t num_cores = - Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); - const size_t num_hyper_threads = - Kokkos::hwloc::get_available_threads_per_core(); - // const size_t num_cores = 1; - // const size_t num_hyper_threads = 1; - Kokkos::InitializationSettings init_args; - init_args.set_num_threads(num_cores*num_hyper_threads); - Kokkos::initialize( init_args ); - //Kokkos::print_configuration(std::cout); + // Initialize Kokkos + Kokkos::initialize(argc, argv); // Run tests Teuchos::UnitTestRepository::setGloballyReduceTestResult(true); diff --git a/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixUQPCEUnitTest_OpenMP.cpp b/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixUQPCEUnitTest_OpenMP.cpp index 1cd3664efd43..dd8ee24ef052 100644 --- a/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixUQPCEUnitTest_OpenMP.cpp +++ b/packages/stokhos/test/UnitTest/Stokhos_TpetraCrsMatrixUQPCEUnitTest_OpenMP.cpp @@ -53,20 +53,11 @@ typedef Tpetra::KokkosCompat::KokkosDeviceWrapperNode OpenMPWrap CRSMATRIX_UQ_PCE_TESTS_N( OpenMPWrapperNode ) int main( int argc, char* argv[] ) { +// Setup the MPI session Teuchos::GlobalMPISession mpiSession(&argc, &argv); - // Initialize threads - const size_t num_cores = - Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa(); - const size_t num_hyper_threads = - Kokkos::hwloc::get_available_threads_per_core(); - // const size_t num_cores = 1; - // const size_t num_hyper_threads = 1; - Kokkos::InitializationSettings init_args; - init_args.set_num_threads(num_cores*num_hyper_threads); - Kokkos::initialize( init_args ); - //Kokkos::print_configuration(std::cout); + // Initialize Kokkos + Kokkos::initialize(argc, argv); // Run tests 
Teuchos::UnitTestRepository::setGloballyReduceTestResult(true); diff --git a/packages/stratimikos/adapters/aztecoo/src/AztecOOParameterList.cpp b/packages/stratimikos/adapters/aztecoo/src/AztecOOParameterList.cpp index d5b29ad2fd25..fbf77cf770bf 100644 --- a/packages/stratimikos/adapters/aztecoo/src/AztecOOParameterList.cpp +++ b/packages/stratimikos/adapters/aztecoo/src/AztecOOParameterList.cpp @@ -68,7 +68,6 @@ inline std::istream& operator>>(std::istream& is, EAztecPreconditioner& prec){ prec = (EAztecPreconditioner)intval; return is; } - const std::string Overlap_name = "Overlap"; @@ -122,14 +121,18 @@ void setAztecOOParameters( ) ) { + // This is the only place that EAztecPreconditioner is used. Everywhere + // else the code expects a string value. case AZTEC_PREC_NONE: solver->SetAztecOption(AZ_precond,AZ_none); + pl->set(AztecPreconditioner_name, "none"); break; case AZTEC_PREC_ILU: solver->SetAztecOption(AZ_precond,AZ_dom_decomp); solver->SetAztecOption(AZ_overlap,getParameter(*pl,Overlap_name)); solver->SetAztecOption(AZ_subdomain_solve,AZ_ilu); solver->SetAztecOption(AZ_graph_fill,getParameter(*pl,GraphFill_name)); + pl->set(AztecPreconditioner_name, "ilu"); break; case AZTEC_PREC_ILUT: solver->SetAztecOption(AZ_precond,AZ_dom_decomp); @@ -137,22 +140,27 @@ void setAztecOOParameters( solver->SetAztecOption(AZ_subdomain_solve,AZ_ilut); solver->SetAztecParam(AZ_drop,getParameter(*pl,DropTolerance_name)); solver->SetAztecParam(AZ_ilut_fill,getParameter(*pl,FillFactor_name)); + pl->set(AztecPreconditioner_name, "ilut"); break; case AZTEC_PREC_JACOBI: solver->SetAztecOption(AZ_precond,AZ_Jacobi); solver->SetAztecOption(AZ_poly_ord,getParameter(*pl,Steps_name)); + pl->set(AztecPreconditioner_name, "Jacobi"); break; case AZTEC_PREC_SYMMGS: solver->SetAztecOption(AZ_precond,AZ_sym_GS); solver->SetAztecOption(AZ_poly_ord,getParameter(*pl,Steps_name)); + pl->set(AztecPreconditioner_name, "Symmetric Gauss-Seidel"); break; case AZTEC_PREC_POLY: 
solver->SetAztecOption(AZ_precond,AZ_Neumann); solver->SetAztecOption(AZ_poly_ord,getParameter(*pl,PolynomialOrder_name)); + pl->set(AztecPreconditioner_name, "Polynomial"); break; case AZTEC_PREC_LSPOLY: solver->SetAztecOption(AZ_precond,AZ_ls); solver->SetAztecOption(AZ_poly_ord,getParameter(*pl,PolynomialOrder_name)); + pl->set(AztecPreconditioner_name, "Least-squares Polynomial"); break; default: TEUCHOS_TEST_FOR_EXCEPT(true); // Should never get here! diff --git a/packages/teuchos/CMakeLists.txt b/packages/teuchos/CMakeLists.txt index de0afbb026f5..e08961c1bdba 100644 --- a/packages/teuchos/CMakeLists.txt +++ b/packages/teuchos/CMakeLists.txt @@ -349,6 +349,13 @@ ENDIF() # ${PACKAGE_NAME}_ENABLE_COMPLEX # C) Set up package-specific options # +TRIBITS_ADD_OPTION_AND_DEFINE( + ${PACKAGE_NAME}_MODIFY_DEFAULTS_DURING_VALIDATION + HAVE_TEUCHOS_MODIFY_DEFAULTS_DURING_VALIDATION + "Modify default parameters during validation." + ON + ) + TRIBITS_ADD_OPTION_AND_DEFINE( ${PACKAGE_NAME}_ENABLE_DEBUG_RCP_NODE_TRACING HAVE_TEUCHOS_DEBUG_RCP_NODE_TRACING diff --git a/packages/teuchos/core/cmake/Teuchos_config.h.in b/packages/teuchos/core/cmake/Teuchos_config.h.in index 15e26f12cf26..3764c1a1bd26 100644 --- a/packages/teuchos/core/cmake/Teuchos_config.h.in +++ b/packages/teuchos/core/cmake/Teuchos_config.h.in @@ -124,6 +124,8 @@ #cmakedefine HAVE_TEUCHOS_DEBUG_RCP_NODE_TRACING +#cmakedefine HAVE_TEUCHOS_MODIFY_DEFAULTS_DURING_VALIDATION + #cmakedefine HAVE_TEUCHOS_DEMANGLE #cmakedefine HAVE_TEUCHOS_EXPAT diff --git a/packages/teuchos/parameterlist/src/Teuchos_ParameterList.cpp b/packages/teuchos/parameterlist/src/Teuchos_ParameterList.cpp index 914d06950bbe..25f1d18f6260 100644 --- a/packages/teuchos/parameterlist/src/Teuchos_ParameterList.cpp +++ b/packages/teuchos/parameterlist/src/Teuchos_ParameterList.cpp @@ -696,7 +696,15 @@ void ParameterList::validateParametersAndSetDefaults( validEntry.getAny(), true // isDefault ); - newEntry.setValidator(validEntry.validator()); + 
RCP validator; + if (nonnull(validator=validEntry.validator())) { +#if defined(HAVE_TEUCHOS_MODIFY_DEFAULTS_DURING_VALIDATION) + validEntry.validator()->validateAndModify(this->name(itr), validEntryName, &newEntry); + // validateAndModify changes the default status so we reset it + newEntry.setAnyValue(newEntry.getAny(), true); +#endif + newEntry.setValidator(validator); + } this->setEntry(validEntryName,newEntry); } } diff --git a/packages/teuchos/parameterlist/src/Teuchos_StandardDependencies.hpp b/packages/teuchos/parameterlist/src/Teuchos_StandardDependencies.hpp index 95c9d588a88c..01067243a86a 100644 --- a/packages/teuchos/parameterlist/src/Teuchos_StandardDependencies.hpp +++ b/packages/teuchos/parameterlist/src/Teuchos_StandardDependencies.hpp @@ -229,7 +229,7 @@ class TEUCHOSPARAMETERLIST_LIB_DLL_EXPORT ValidatorDependency : public Dependenc }; /** - * \brief A string visual depdencies says the following about the + * \brief A string visual dependency says the following about the * relationship between two elements in a Parameter List: * Depending on whether or not the dependee has a particular value, * the dependent may or may not be displayed to the user in a UI. 
diff --git a/packages/teuchos/parameterlist/src/Teuchos_StandardParameterEntryValidators.hpp b/packages/teuchos/parameterlist/src/Teuchos_StandardParameterEntryValidators.hpp index da878712c40f..682ab3c6ed7c 100644 --- a/packages/teuchos/parameterlist/src/Teuchos_StandardParameterEntryValidators.hpp +++ b/packages/teuchos/parameterlist/src/Teuchos_StandardParameterEntryValidators.hpp @@ -317,6 +317,14 @@ class StringToIntegralParameterEntryValidator : public ParameterEntryValidator { std::string const& paramName, std::string const& sublistName ) const; + +#if defined(HAVE_TEUCHOS_MODIFY_DEFAULTS_DURING_VALIDATION) + void validateAndModify( + std::string const& paramName, + std::string const& sublistName, + ParameterEntry * entry + ) const; +#endif //@} @@ -328,9 +336,25 @@ class StringToIntegralParameterEntryValidator : public ParameterEntryValidator { typedef std::map map_t; map_t map_; + typedef std::map inv_map_t; + inv_map_t inv_map_; const bool caseSensitive_; + /** \brief Auxiliary method to simplify constructors + * + * \param strings [in] Array of unique names for the enum or integer + * values. These are the strings which users will see and use + * when setting parameters. strings[i] will be + * associated with the enum or integer value + * integralValues[i]. + * + * \param integralValues [in] Array of the enum or integer values + * associated with strings[]. + */ + void init(const ArrayView& strings, + const ArrayView& integralValues); + void setValidValues( ArrayView const& strings, ArrayView const* stringsDocs = NULL @@ -434,8 +458,8 @@ stringToIntegralParameterEntryValidator( * * The function getIntegralValue() can then be used to extract the * integral value of the std::string parameter. In this case, the integral - * value return will just be the zero-based index of the std::string value in - * the list strings. + * value returned will just be the zero-based index of the std::string value + * in the list strings. 
* * \relates ParameterList */ @@ -497,9 +521,9 @@ void setStringToIntegralParameter( /** \brief Get an integral value for a parameter that is assumed to already be * set. * - * This function does a dynamic cast to get the underlying valiator of type + * This function does a dynamic cast to get the underlying validator of type * StringToIntegralParameterEntryValidator. If this dynamic - * cast failes then an Exceptions::InvalidParameterType + * cast fails then an Exceptions::InvalidParameterType * std::exception is thrown with an excellent error message. * * \relates ParameterList @@ -2423,16 +2447,10 @@ StringToIntegralParameterEntryValidator (ArrayView const& str defaultParameterName_ (defaultParameterName), caseSensitive_ (caseSensitive) { - typedef typename map_t::value_type val_t; - for (int i = 0; i < static_cast (strings.size ()); ++i) { - const bool unique = caseSensitive_ ? - map_.insert (val_t (strings[i], static_cast (i))).second : - map_.insert (val_t (upperCase (strings[i]), static_cast (i))).second; - TEUCHOS_TEST_FOR_EXCEPTION( - ! unique, std::logic_error, - "For parameter \"" << defaultParameterName_ << "\": " - "strings[" << i << "] = \"" << strings[i] << "\" is a duplicate."); - } + const int length = static_cast(strings.size()); + Array integralValues(length); + for (int i = 0; i < length; ++i) integralValues[i] = static_cast(i); + init(strings, integralValues); setValidValues (strings); } @@ -2450,22 +2468,7 @@ StringToIntegralParameterEntryValidator (ArrayView const& str #ifdef TEUCHOS_DEBUG TEUCHOS_ASSERT_EQUALITY( strings.size(), integralValues.size() ); #endif - TEUCHOS_TEST_FOR_EXCEPTION( - strings.size() != integralValues.size(), - std::logic_error, - "The input arrays strings and integralValues must have the same length."); - - typedef typename map_t::value_type val_t; - for (int i = 0; i < static_cast (strings.size ()); ++i) { - const bool unique = caseSensitive_ ? 
- map_.insert (val_t (strings[i], integralValues[i])).second : - map_.insert (val_t (upperCase (strings[i]), integralValues[i])).second; - - TEUCHOS_TEST_FOR_EXCEPTION( - ! unique, std::logic_error, - "For parameter \"" << defaultParameterName_ << "\": " - "strings[" << i << "] = \"" << strings[i] << "\" is a duplicate."); - } + init(strings, integralValues); setValidValues (strings); } @@ -2482,30 +2485,44 @@ StringToIntegralParameterEntryValidator (ArrayView const& { #ifdef TEUCHOS_DEBUG TEUCHOS_ASSERT_EQUALITY( strings.size(), stringsDocs.size() ); - TEUCHOS_ASSERT_EQUALITY( strings.size(), integralValues.size() ); #endif - TEUCHOS_TEST_FOR_EXCEPTION( - strings.size() != integralValues.size(), - std::logic_error, - "The input arrays strings and integralValues must have the same length."); - TEUCHOS_TEST_FOR_EXCEPTION( strings.size() != stringsDocs.size(), std::logic_error, "The input arrays strings and stringsDocs must have the same length."); + init(strings, integralValues); + setValidValues(strings,&stringsDocs); +} + +template +void StringToIntegralParameterEntryValidator::init( + ArrayView const &strings, + ArrayView const &integralValues) { + +#ifdef TEUCHOS_DEBUG + TEUCHOS_ASSERT_EQUALITY(strings.size(), integralValues.size()); +#endif + + TEUCHOS_TEST_FOR_EXCEPTION( + strings.size() != integralValues.size(), std::logic_error, + "The input arrays strings and integralValues must have the same length."); + typedef typename map_t::value_type val_t; - for (int i = 0; i < static_cast (strings.size ()); ++i) { - const bool unique = caseSensitive_ ? - map_.insert (val_t (strings[i], integralValues[i])).second : - map_.insert (val_t (upperCase (strings[i]), integralValues[i])).second; - TEUCHOS_TEST_FOR_EXCEPTION( - ! 
unique, std::logic_error, - "For parameter \"" << defaultParameterName_ << "\": " - "strings[" << i << "] = \"" << strings[i] << "\" is a duplicate."); + typedef typename inv_map_t::value_type inv_val_t; + for (int i = 0; i < static_cast(strings.size()); ++i) { + const std::string name = + caseSensitive_ ? strings[i] : upperCase(strings[i]); + const bool unique = map_.insert(val_t(name, integralValues[i])).second; + TEUCHOS_TEST_FOR_EXCEPTION(!unique, std::logic_error, + "For parameter \"" << defaultParameterName_ + << "\": " + "strings[" + << i << "] = \"" << strings[i] + << "\" is a duplicate."); + inv_map_.insert(inv_val_t(integralValues[i], name)); } - setValidValues(strings,&stringsDocs); } // Lookup functions @@ -2540,18 +2557,22 @@ StringToIntegralParameterEntryValidator::getIntegralValue( ,const std::string &sublistName, const bool activeQuery ) const { - const bool validType = ( entry.getAny(activeQuery).type() == typeid(std::string) ); - TEUCHOS_TEST_FOR_EXCEPTION_PURE_MSG( - !validType, Exceptions::InvalidParameterType - ,"Error, the parameter {paramName=\""<<(paramName.length()?paramName:defaultParameterName_) - << "\",type=\""<(entry.getAny(activeQuery)); // This cast should not fail! - return getIntegralValue(strValue,paramName,sublistName); // This will validate the value and throw! + if (entry.isType()){ + return any_cast(entry.getAny(activeQuery)); + } else{ + const bool validType = ( entry.getAny(activeQuery).type() == typeid(std::string) ); + TEUCHOS_TEST_FOR_EXCEPTION_PURE_MSG( + !validType, Exceptions::InvalidParameterType + ,"Error, the parameter {paramName=\""<<(paramName.length()?paramName:defaultParameterName_) + << "\",type=\""<(entry.getAny(activeQuery)); // This cast should not fail! + return getIntegralValue(strValue,paramName,sublistName); // This will validate the value and throw! 
+ } } @@ -2562,10 +2583,18 @@ StringToIntegralParameterEntryValidator::getStringValue( ,const std::string &sublistName, const bool activeQuery ) const { - // Validate the parameter's type and value - this->getIntegralValue(entry,paramName,sublistName,activeQuery); - // Return the std::string value which is now validated! - return any_cast(entry.getAny(activeQuery)); // This cast should not fail! + if (entry.isType()){ + const IntegralType intVal = any_cast(entry.getAny(activeQuery)); + typename inv_map_t::const_iterator itr = inv_map_.find(intVal); + // typename inv_map_t::const_iterator itr = inv_map_.find(intVal); + // TODO: Maybe do a test on intVal but it should be valid by construction + return (*itr).second; + } else{ + // Validate the parameter's type and value + this->getIntegralValue(entry,paramName,sublistName,activeQuery); + // Return the std::string value which is now validated! + return any_cast(entry.getAny(activeQuery)); // This cast should not fail! + } } @@ -2592,7 +2621,7 @@ StringToIntegralParameterEntryValidator::getStringValue( { const std::string& strValue = paramList.get (paramName, - caseSensitive_ ? defaultValue : upperCase (defaultValue)); + caseSensitive_ ? defaultValue : upperCase (defaultValue)); getIntegralValue(strValue,paramName,paramList.name()); // Validate! 
return strValue; } @@ -2676,6 +2705,19 @@ void StringToIntegralParameterEntryValidator::validate( } +#if defined(HAVE_TEUCHOS_MODIFY_DEFAULTS_DURING_VALIDATION) +template +void StringToIntegralParameterEntryValidator::validateAndModify( + std::string const& paramName, + std::string const& sublistName, + ParameterEntry * entry + ) const +{ + entry->setValue(this->getIntegralValue(*entry, paramName, sublistName, false)); +} +#endif + + // private template @@ -2889,12 +2931,16 @@ IntegralType Teuchos::getIntegralValue( ) { const ParameterEntry &entry = paramList.getEntry(paramName); - RCP > - integralValidator = getStringToIntegralParameterEntryValidator( - entry, paramList, paramName - ); - return integralValidator->getIntegralValue( - entry, paramName, paramList.name(), true ); + if (entry.isType()){ + return getValue(entry); + } else{ + RCP > + integralValidator = getStringToIntegralParameterEntryValidator( + entry, paramList, paramName + ); + return integralValidator->getIntegralValue( + entry, paramName, paramList.name(), true ); + } } diff --git a/packages/teuchos/parameterlist/test/Dependencies/Dependencies_UnitTests.cpp b/packages/teuchos/parameterlist/test/Dependencies/Dependencies_UnitTests.cpp index 626feed17233..bff2f925652c 100644 --- a/packages/teuchos/parameterlist/test/Dependencies/Dependencies_UnitTests.cpp +++ b/packages/teuchos/parameterlist/test/Dependencies/Dependencies_UnitTests.cpp @@ -472,7 +472,7 @@ TEUCHOS_UNIT_TEST(Teuchos_Dependencies, testVisualDeps){ /* - * Testing the StringVisualDepenency + * Testing the StringVisualDependency */ ParameterList& stringVisDepList = My_deplist->sublist( diff --git a/packages/teuchos/parameterlist/test/ParameterList/ParameterList_UnitTests.cpp b/packages/teuchos/parameterlist/test/ParameterList/ParameterList_UnitTests.cpp index 48a460f71200..cdead39cbfca 100644 --- a/packages/teuchos/parameterlist/test/ParameterList/ParameterList_UnitTests.cpp +++ 
b/packages/teuchos/parameterlist/test/ParameterList/ParameterList_UnitTests.cpp @@ -866,14 +866,39 @@ TEUCHOS_UNIT_TEST( ParameterList, validateAgainstSelf ) } -TEUCHOS_UNIT_TEST( ParameterList, validateParametersAndSetDefaults ) +TEUCHOS_UNIT_TEST( ParameterList, validateParametersAndSetDefaults_default ) { + // Test for proper behavior when the user doesn't set `Nonlinear Solver` ParameterList PL_Main = createMainPL(); ParameterList PL_Main_valid = createValidMainPL(); ECHO(PL_Main.validateParametersAndSetDefaults(PL_Main_valid)); TEST_NOTHROW( rcp_dynamic_cast >( PL_Main.getEntry("Nonlinear Solver").validator(), true ) ); + // Make sure the parameter entry is set to default and unused after validation + const ParameterEntry &default_entry = PL_Main.getEntry("Nonlinear Solver"); + TEST_EQUALITY(default_entry.isDefault(), true); + TEST_EQUALITY(default_entry.isUsed(), false); + // Make sure the value is stored as an integer after validation +#if defined(HAVE_TEUCHOS_MODIFY_DEFAULTS_DURING_VALIDATION) + TEST_NOTHROW(Teuchos::any_cast(default_entry.getAny())); +#endif +} + + +TEUCHOS_UNIT_TEST( ParameterList, validateParametersAndSetDefaults_noDefault ) +{ + // Now make sure we have the correct behavior when not using a default value + ParameterList PL_Main = createMainPL(); + PL_Main.set("Nonlinear Solver", "Trust Region Based"); + ParameterList PL_Main_valid = createValidMainPL(); + PL_Main.validateParametersAndSetDefaults(PL_Main_valid); + const ParameterEntry &entry = PL_Main.getEntry("Nonlinear Solver"); + TEST_EQUALITY(entry.isDefault(), false); + TEST_EQUALITY(entry.isUsed(), false); +#if defined(HAVE_TEUCHOS_MODIFY_DEFAULTS_DURING_VALIDATION) + TEST_NOTHROW(Teuchos::any_cast(entry.getAny())); +#endif } diff --git a/packages/teuchos/parameterlist/test/Validators/Validator_UnitTest.cpp b/packages/teuchos/parameterlist/test/Validators/Validator_UnitTest.cpp index 38f322eb06e8..be44944bfcb9 100644 --- 
a/packages/teuchos/parameterlist/test/Validators/Validator_UnitTest.cpp +++ b/packages/teuchos/parameterlist/test/Validators/Validator_UnitTest.cpp @@ -495,6 +495,43 @@ TEUCHOS_UNIT_TEST(Teuchos_Validators, stringValidator) } +/* + * Testing StringToIntegralParameterEntryValidator. + */ +TEUCHOS_UNIT_TEST(Teuchos_Validators, StringToIntegralParameterEntryValidator) { + Array strVals = tuple("str1", "str2", "str3"); + Array strDocs = tuple("a str1", "a str2", "a str3"); + Array intVals = tuple(1, 2, 3); + bool caseSensitive = true; + typedef StringToIntegralParameterEntryValidator ret_type; + // Note that validator1 maps the strings to {0, 1, 2} not {1, 2, 3} as in `intVals` + RCP validator1 = rcp(new ret_type(strVals, "str1", caseSensitive)); + RCP validator2 = rcp(new ret_type(strVals, intVals, "str1", caseSensitive)); + RCP validator3 = rcp(new ret_type(strVals, strDocs, intVals, "str1", caseSensitive)); + TEST_EQUALITY(strDocs, *validator3->getStringDocs()); + ParameterList valid_pl = ParameterList(); + valid_pl.set("Param1", "str1", "Parameter 1", validator1); + valid_pl.set("Param2", "str1", "Parameter 2", validator2); + valid_pl.set("Param3", "str1", "Parameter 3", validator3); + ParameterList user_pl = ParameterList(); + user_pl.set("Param1", "str1"); + user_pl.set("Param2", "str2"); + user_pl.set("Param3", "str3"); + // Test `getStringValue` and `getIntegralValue` before validation on `valid_pl` + TEST_EQUALITY(0, getIntegralValue(valid_pl, "Param1")); + TEST_EQUALITY(intVals[0], getIntegralValue(valid_pl, "Param2")); + TEST_EQUALITY(strVals[0], getStringValue(valid_pl, "Param2")); + // Test `getStringValue` and `getIntegralValue` after validation on `user_pl` + user_pl.validateParametersAndSetDefaults(valid_pl); + TEST_EQUALITY(0, getIntegralValue(user_pl, "Param1")); + TEST_EQUALITY(intVals[1], getIntegralValue(user_pl, "Param2")); + TEST_EQUALITY(intVals[2], getIntegralValue(user_pl, "Param3")); + TEST_EQUALITY(strVals[0], getStringValue(user_pl, 
"Param1")); + TEST_EQUALITY(strVals[1], getStringValue(user_pl, "Param2")); + TEST_EQUALITY(strVals[2], getStringValue(user_pl, "Param3")); +} + + /* * Testing FileNameValidator. */ diff --git a/packages/tpetra/CMakeLists.txt b/packages/tpetra/CMakeLists.txt index 09a05be23d93..0d7df3785907 100644 --- a/packages/tpetra/CMakeLists.txt +++ b/packages/tpetra/CMakeLists.txt @@ -24,7 +24,7 @@ TRIBITS_ADD_OPTION_AND_DEFINE( # Supported Kokkos version in Trilinos # NOTE: When we snapshot Kokkos into Trilinos, we have to update these numbers to maintain # compatibility with external Kokkos -SET(Tpetra_SUPPORTED_KOKKOS_VERSION "4.2.1") +SET(Tpetra_SUPPORTED_KOKKOS_VERSION "4.3.0") # Option to allow developers to ignore incompatible Kokkos versions diff --git a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_decl.hpp b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_decl.hpp index 31d048d0e9f4..2b838977e957 100644 --- a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_decl.hpp @@ -44,6 +44,7 @@ #include "Tpetra_BlockCrsMatrix_fwd.hpp" #include "Tpetra_CrsMatrix_fwd.hpp" +#include "Tpetra_CrsGraph_fwd.hpp" #include "Tpetra_Map_fwd.hpp" #include "Teuchos_RCP.hpp" #include @@ -88,6 +89,18 @@ namespace Tpetra { template void writeMatrixStrip(BlockCrsMatrix const &A, std::ostream &os, Teuchos::ParameterList const ¶ms); + /// \brief Non-member constructor that creates the CrsGraph of a BlockCrsMatrix + /// from an existing point CrsMatrix and a block size. + /// + /// This function accepts an already constructed point version of the block matrix. + /// Assumptions: + /// - All point entries in a logical block must be stored in the CrsMatrix, even + /// if the values are zero. + /// - Point rows corresponding to a particular mesh node must be stored consecutively. 
+ template + Teuchos::RCP> + getBlockCrsGraph(const Tpetra::CrsMatrix& pointMatrix, const LO &blockSize); + /// \brief Non-member constructor that creates a BlockCrsMatrix from an existing point CrsMatrix. /// /// This function accepts an already constructed point version of the block matrix. diff --git a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_def.hpp b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_def.hpp index ed5b3fcba0d5..9c8d7dddb924 100644 --- a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_def.hpp +++ b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_Helpers_def.hpp @@ -292,8 +292,8 @@ namespace Tpetra { } template - Teuchos::RCP > - convertToBlockCrsMatrix(const Tpetra::CrsMatrix& pointMatrix, const LO &blockSize) + Teuchos::RCP > + getBlockCrsGraph(const Tpetra::CrsMatrix& pointMatrix, const LO &blockSize) { /* @@ -308,11 +308,19 @@ namespace Tpetra { using Teuchos::ArrayView; using Teuchos::RCP; - typedef Tpetra::BlockCrsMatrix block_crs_matrix_type; typedef Tpetra::Map map_type; typedef Tpetra::CrsGraph crs_graph_type; typedef Tpetra::CrsMatrix crs_matrix_type; + using local_graph_device_type = typename crs_matrix_type::local_graph_device_type; + using row_map_type = typename local_graph_device_type::row_map_type::non_const_type; + using entries_type = typename local_graph_device_type::entries_type::non_const_type; + + using offset_type = typename row_map_type::non_const_value_type; + + using execution_space = typename Node::execution_space; + using range_type = Kokkos::RangePolicy; + const map_type &pointRowMap = *(pointMatrix.getRowMap()); RCP meshRowMap = createMeshMap(blockSize, pointRowMap); @@ -326,116 +334,116 @@ namespace Tpetra { const map_type &pointRangeMap = *(pointMatrix.getRangeMap()); RCP meshRangeMap = createMeshMap(blockSize, pointRangeMap); - // Use graph ctor that provides column map and upper bound on nonzeros per row. 
- // We can use static profile because the point graph should have at least as many entries per - // row as the mesh graph. - RCP meshCrsGraph = rcp(new crs_graph_type(meshRowMap, meshColMap, - pointMatrix.getGlobalMaxNumRowEntries())); - // Fill the graph by walking through the matrix. For each mesh row, we query the collection of point - // rows associated with it. The point column ids are converted to mesh column ids and put into an array. - // As each point row collection is finished, the mesh column ids are sorted, made unique, and inserted - // into the mesh graph. - typename crs_matrix_type::local_inds_host_view_type pointColInds; - typename crs_matrix_type::values_host_view_type pointVals; - Array meshColGids; - meshColGids.reserve(pointMatrix.getGlobalMaxNumRowEntries()); - - //again, I assume that point GIDs associated with a mesh GID are consecutive. - //if they are not, this will break!! - GO indexBase = pointColMap.getIndexBase(); - for (size_t i=0; igetLocalElement(meshColInd) == Teuchos::OrdinalTraits::invalid()) { - std::ostringstream oss; - oss<< "["<getLocalGraphDevice(); + auto pointRowptr = pointLocalGraph.row_map; + auto pointColind = pointLocalGraph.entries; + + TEUCHOS_TEST_FOR_EXCEPTION(pointColind.extent(0) % bs2 != 0, + std::runtime_error, "Tpetra::getBlockCrsGraph: " + "local number of non zero entries is not a multiple of blockSize^2 "); + + LO block_rows = pointRowptr.extent(0) == 0 ? 
0 : (pointRowptr.extent(0)-1)/blockSize; + row_map_type blockRowptr("blockRowptr", block_rows+1); + entries_type blockColind("blockColind", pointColind.extent(0)/(bs2)); + + Kokkos::parallel_for("fillMesh",range_type(0,block_rows), KOKKOS_LAMBDA(const LO i) { + + const LO offset_b = pointRowptr(i*blockSize)/bs2; + const LO offset_b_max = pointRowptr((i+1)*blockSize)/bs2; + + if (i==block_rows-1) + blockRowptr(i+1) = offset_b_max; + blockRowptr(i) = offset_b; + + const LO offset_p = pointRowptr(i*blockSize); - meshColGids.push_back(meshColInd); + for (LO k=0; kinsertGlobalIndices(meshRowMap->getGlobalElement(i), meshColGids()); - meshColGids.clear(); + }); + + meshCrsGraph = rcp(new crs_graph_type(meshRowMap, meshColMap, blockRowptr, blockColind)); + meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap); + Kokkos::DefaultExecutionSpace().fence(); } - meshCrsGraph->fillComplete(meshDomainMap,meshRangeMap); - - //create and populate the block matrix - RCP blockMatrix = rcp(new block_crs_matrix_type(*meshCrsGraph, blockSize)); - - /// temporary pack - Array tmpBlock(blockSize*blockSize); - - //preallocate the maximum number of (dense) block entries needed by any row - int maxBlockEntries = blockMatrix->getLocalMaxNumRowEntries(); - Array> blocks(maxBlockEntries); - for (int i=0; i bcol2bentry; //maps block column index to dense block entries - std::map::iterator iter; - //Fill the block matrix. We must do this in local index space. - //TODO: Optimization: We assume the blocks are fully populated in the point matrix. This means - //TODO: on the first point row in the block row, we know that we're hitting new block col indices. 
- //TODO: on other rows, we know the block col indices have all been seen before - //int offset; - //if (pointMatrix.getIndexBase()) offset = 0; - //else offset = 1; - for (size_t i=0; isecond; - blocks[littleBlock].push_back(pointVals[k]); + + return meshCrsGraph; + } + + template + Teuchos::RCP > + convertToBlockCrsMatrix(const Tpetra::CrsMatrix& pointMatrix, const LO &blockSize) + { + + /* + ASSUMPTIONS: + + 1) In point matrix, all entries associated with a little block are present (even if they are zero). + 2) For given mesh DOF, point DOFs appear consecutively and in ascending order in row & column maps. + 3) Point column map and block column map are ordered consistently. + */ + + using Teuchos::Array; + using Teuchos::ArrayView; + using Teuchos::RCP; + + typedef Tpetra::BlockCrsMatrix block_crs_matrix_type; + typedef Tpetra::CrsMatrix crs_matrix_type; + + using local_graph_device_type = typename crs_matrix_type::local_graph_device_type; + using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type; + using row_map_type = typename local_graph_device_type::row_map_type::non_const_type; + using values_type = typename local_matrix_device_type::values_type::non_const_type; + + using offset_type = typename row_map_type::non_const_value_type; + + using execution_space = typename Node::execution_space; + using range_type = Kokkos::RangePolicy; + + RCP blockMatrix; + + const offset_type bs2 = blockSize * blockSize; + + auto meshCrsGraph = getBlockCrsGraph(pointMatrix, blockSize); + { + TEUCHOS_FUNC_TIME_MONITOR("Tpetra::convertToBlockCrsMatrix::fillBlockCrsMatrix"); + auto pointLocalGraph = pointMatrix.getCrsGraph()->getLocalGraphDevice(); + auto pointRowptr = pointLocalGraph.row_map; + auto pointColind = pointLocalGraph.entries; + + offset_type block_rows = pointRowptr.extent(0) == 0 ? 
0 : (pointRowptr.extent(0)-1)/blockSize; + values_type blockValues("values", meshCrsGraph->getLocalNumEntries()*bs2); + auto pointValues = pointMatrix.getLocalValuesDevice (Access::ReadOnly); + auto blockRowptr = meshCrsGraph->getLocalGraphDevice().row_map; + + Kokkos::parallel_for("copyblockValues",range_type(0,block_rows),KOKKOS_LAMBDA(const LO i) { + const offset_type blkBeg = blockRowptr[i]; + const offset_type numBlocks = blockRowptr[i+1] - blkBeg; + + // For each block in the row... + for (offset_type block=0; block < numBlocks; block++) { + + // For each entry in the block... + for(LO little_row=0; little_rowfirst; - Scalar *vals = (blocks[iter->second]).getRawPtr(); - if (std::is_same::value) { - /// col major - for (LO ii=0;iireplaceLocalValues(i, &localBlockCol, tmp_vals, 1); - } else { - /// row major - blockMatrix->replaceLocalValues(i, &localBlockCol, vals, 1); - } - } - //Done with block row. Zero everything out. - for (int j=0; j:: std::runtime_error, "X and Y may not alias one another."); } - LocalOrdinal nrows = getLocalNumRows(); - LocalOrdinal maxRowImbalance = 0; - if(nrows != 0) - maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows); - #if KOKKOSKERNELS_VERSION >= 40299 auto A_lcl = getLocalMatrixDevice(); if(!applyHelper.get()) { // The apply helper does not exist, so create it. - // This is when we can choose the spmv algorithm. - bool exceedsImbalanceThreshold = size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold(); - KokkosSparse::SPMVAlgorithm algo = - exceedsImbalanceThreshold ? 
KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT; - applyHelper = std::make_shared(A_lcl.nnz(), A_lcl.graph.row_map, algo); + applyHelper = std::make_shared(A_lcl.nnz(), A_lcl.graph.row_map); } // Translate mode (Teuchos enum) to KokkosKernels (1-character string) @@ -5129,6 +5120,11 @@ CrsMatrix:: impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl); } #else + LocalOrdinal nrows = getLocalNumRows(); + LocalOrdinal maxRowImbalance = 0; + if(nrows != 0) + maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows); + auto matrix_lcl = getLocalMultiplyOperator(); if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold()) matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta); diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp index d2943f5b75d3..5baaaffd1445 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp +++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp @@ -129,6 +129,8 @@ constexpr const std::string_view HIERARCHICAL_UNPACK = "TPETRA_HIERARCHICAL_UNPACK"; constexpr const std::string_view SKIP_COPY_AND_PERMUTE = "TPETRA_SKIP_COPY_AND_PERMUTE"; +constexpr const std::string_view FUSED_RESIDUAL = + "TPETRA_FUSED_RESIDUAL"; constexpr const std::string_view OVERLAP = "TPETRA_OVERLAP"; constexpr const std::string_view SPACES_ID_WARN_LIMIT = "TPETRA_SPACES_ID_WARN_LIMIT"; @@ -156,7 +158,8 @@ constexpr const auto RECOGNIZED_VARS = make_array( MULTIVECTOR_USE_MERGE_PATH, VECTOR_DEVICE_THRESHOLD, HIERARCHICAL_UNPACK_BATCH_SIZE, HIERARCHICAL_UNPACK_TEAM_SIZE, USE_TEUCHOS_TIMERS, USE_KOKKOS_PROFILING, DEBUG, VERBOSE, TIMING, - HIERARCHICAL_UNPACK, SKIP_COPY_AND_PERMUTE, OVERLAP, SPACES_ID_WARN_LIMIT, + HIERARCHICAL_UNPACK, SKIP_COPY_AND_PERMUTE, FUSED_RESIDUAL, + OVERLAP, SPACES_ID_WARN_LIMIT, TIME_KOKKOS_DEEP_COPY, TIME_KOKKOS_DEEP_COPY_VERBOSE1, TIME_KOKKOS_DEEP_COPY_VERBOSE2, TIME_KOKKOS_FENCE, 
TIME_KOKKOS_FUNCTIONS); @@ -672,6 +675,16 @@ bool Behavior::skipCopyAndPermuteIfPossible() { defaultValue); } +bool Behavior::fusedResidual() { + constexpr bool defaultValue(true); + + static bool value_ = defaultValue; + static bool initialized_ = false; + return idempotentlyGetEnvironmentVariable( + value_, initialized_, BehaviorDetails::FUSED_RESIDUAL, + defaultValue); +} + bool Behavior::overlapCommunicationAndComputation() { constexpr bool defaultValue(false); diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp index 167e39070c30..99855768b6e1 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp @@ -259,10 +259,17 @@ class Behavior { /// \brief Use Kokkos::Profiling in Tpetra::ProfilingRegion /// /// This is enabled by default if KOKKOS_ENABLE_PROFILING is defined. - /// You mau control this at run time via the TPETRA_USE_KOKKOS_PROFILING + /// You may control this at run time via the TPETRA_USE_KOKKOS_PROFILING /// environment variable. static bool profilingRegionUseKokkosProfiling(); + /// \brief Fusing SpMV and update in residual instead of using 2 kernel launches. + /// Fusing kernels implies that no TPLs (CUSPARSE, ROCSPARSE, ...) will be used for the residual. + /// + /// This is enabled by default. You may control this at run time via the + /// TPETRA_FUSED_RESIDUAL environment variable. + static bool fusedResidual(); + /// \brief Skip copyAndPermute if possible /// /// This is disabled by default. 
You may control this at run time via the diff --git a/packages/tpetra/core/src/Tpetra_Details_residual.hpp b/packages/tpetra/core/src/Tpetra_Details_residual.hpp index 8e6e20e1c73c..6858f3748fb1 100644 --- a/packages/tpetra/core/src/Tpetra_Details_residual.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_residual.hpp @@ -338,16 +338,18 @@ void localResidual(const CrsMatrix & A, (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr), std::runtime_error, "X, Y and R may not alias one another."); } - -#ifdef TPETRA_DETAILS_USE_REFERENCE_RESIDUAL - SC one = Teuchos::ScalarTraits::one(); - SC negone = -one; - SC zero = Teuchos::ScalarTraits::zero(); - // This is currently a "reference implementation" waiting until Kokkos Kernels provides - // a residual kernel. - A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero); - R.update(one,B,negone); -#else + + const bool fusedResidual = ::Tpetra::Details::Behavior::fusedResidual (); + if (!fusedResidual) { + SC one = Teuchos::ScalarTraits::one(); + SC negone = -one; + SC zero = Teuchos::ScalarTraits::zero(); + // This is currently a "reference implementation" waiting until Kokkos Kernels provides + // a residual kernel. + A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero); + R.update(one,B,negone); + return; + } if (A_lcl.numRows() == 0) { return; @@ -412,7 +414,6 @@ void localResidual(const CrsMatrix & A, } } -#endif } diff --git a/packages/zoltan2/core/src/environment/Zoltan2_Environment.cpp b/packages/zoltan2/core/src/environment/Zoltan2_Environment.cpp index b1804502bd26..025ec3dab4d6 100644 --- a/packages/zoltan2/core/src/environment/Zoltan2_Environment.cpp +++ b/packages/zoltan2/core/src/environment/Zoltan2_Environment.cpp @@ -316,11 +316,6 @@ void Environment::commitParameters() params_.validateParametersAndSetDefaults(validParams, 0); - // For all of the string to integer parameters, convert - // them to the integer. I would have - // expected validateAndModify() to do this. 
- - convertStringToInt(params_); } ///////////////////////////////////////////////////////////////////// @@ -419,46 +414,6 @@ void Environment::commitParameters() params_.get("error_check_level", BASIC_ASSERTION)); #endif } - -void Environment::convertStringToInt(Teuchos::ParameterList ¶ms) -{ - using Teuchos::ParameterList; - using Teuchos::ParameterEntry; - using Teuchos::RCP; - using Teuchos::rcp_dynamic_cast; - ParameterList::ConstIterator next = params.begin(); - - // Data type of these parameters will now change from string to int - - std::string validatorNameInt("StringIntegralValidator(int)"); - std::string validatorNameBool("StringIntegralValidator(bool)"); - typedef Teuchos::StringToIntegralParameterEntryValidator s2int_t; - - while (next != params.end()){ - - const std::string &name = next->first; - ParameterEntry &entry = params.getEntry(name); - - if (entry.isList()){ - ParameterList *dummy = NULL; - ParameterList &pl = entry.getValue(dummy); - convertStringToInt(pl); - } - else{ - if ((entry.validator()).get()){ - if (entry.validator()->getXMLTypeName() == validatorNameInt){ - std::string dummy(""); - std::string &entryValue = entry.getValue(&dummy); - RCP s2int = - Teuchos::rcp_dynamic_cast(entry.validator(), true); - int val = s2int->getIntegralValue(entryValue); - entry.setValue(val); - } - } - } - ++next; - } -} } //namespace Zoltan2 diff --git a/packages/zoltan2/core/src/environment/Zoltan2_Environment.hpp b/packages/zoltan2/core/src/environment/Zoltan2_Environment.hpp index 5795f2f2f33b..2dc52ccfddec 100644 --- a/packages/zoltan2/core/src/environment/Zoltan2_Environment.hpp +++ b/packages/zoltan2/core/src/environment/Zoltan2_Environment.hpp @@ -598,15 +598,8 @@ class Environment{ * StringToIntegral parameters have been converted * to integer values or bool values. 
* - * Given a parameter list, this function converts all of the entries that - * have valiator of type StringToIntegralParameterEntryValidator - * from their string value to their int value. - * Also StringToIntegralParameterEntryValidator to bool - * */ - static void convertStringToInt(Teuchos::ParameterList ¶ms); - private: /*! \brief Set up the Environment for the constructor. diff --git a/packages/zoltan2/sphynx/src/Zoltan2_SphynxProblem.hpp b/packages/zoltan2/sphynx/src/Zoltan2_SphynxProblem.hpp index 0f29ed0fd56e..eab485f5b848 100644 --- a/packages/zoltan2/sphynx/src/Zoltan2_SphynxProblem.hpp +++ b/packages/zoltan2/sphynx/src/Zoltan2_SphynxProblem.hpp @@ -168,7 +168,6 @@ namespace Zoltan2 { Z2_FORWARD_EXCEPTIONS sphynxParams_->validateParametersAndSetDefaults(validParams, 0); - this->env_->convertStringToInt(*sphynxParams_.get()); int nparts = -1; const Teuchos::ParameterEntry *pe = this->params_->getEntryPtr("num_global_parts"); diff --git a/packages/zoltan2/test/core/unit/environment/AllParameters.cpp b/packages/zoltan2/test/core/unit/environment/AllParameters.cpp index f7a39b14e46d..e2ce7294c097 100644 --- a/packages/zoltan2/test/core/unit/environment/AllParameters.cpp +++ b/packages/zoltan2/test/core/unit/environment/AllParameters.cpp @@ -281,7 +281,6 @@ int main(int narg, char *arg[]) try{ Zoltan2::createValidatorList(myParams, validParameters); myParams.validateParametersAndSetDefaults(validParameters); - Zoltan2::Environment::convertStringToInt(myParams); } catch(std::exception &e){ std::cerr << "Validate parameters generated an error:" << std::endl; diff --git a/packages/zoltan2/test/core/unit/environment/Parameters.cpp b/packages/zoltan2/test/core/unit/environment/Parameters.cpp index 23b16fc96f09..e2127c8c0988 100644 --- a/packages/zoltan2/test/core/unit/environment/Parameters.cpp +++ b/packages/zoltan2/test/core/unit/environment/Parameters.cpp @@ -86,7 +86,6 @@ int main(int narg, char *arg[]) try{ Zoltan2::createValidatorList(myParams, 
validParameters); myParams.validateParametersAndSetDefaults(validParameters); - Zoltan2::Environment::convertStringToInt(myParams); } catch(std::exception &e){ std::cerr << "Validate parameters generated an error:" << std::endl; @@ -170,7 +169,6 @@ int main(int narg, char *arg[]) try{ Zoltan2::createValidatorList(all, validParameters); all.validateParametersAndSetDefaults(validParameters); - Zoltan2::Environment::convertStringToInt(all); } catch(std::exception &e){ std::cerr << "Validate parameters generated an error:" << std::endl;