diff --git a/.github/workflows/clang-format-check.yml b/.github/workflows/clang-format-check.yml new file mode 100644 index 00000000000..5a7e0e1c662 --- /dev/null +++ b/.github/workflows/clang-format-check.yml @@ -0,0 +1,15 @@ +name: clang-format check + +on: [push, pull_request] + +permissions: read-all + +jobs: + formatting-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Run clang-format style check. + uses: DoozyX/clang-format-lint-action@v0.16.2 + with: + clangFormatVersion: 8 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000000..2ed86a14751 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,51 @@ +name: "CodeQL" + +on: + push: + branches: [ "master", "develop", "release-*" ] + pull_request: + branches: [ "develop" ] + +permissions: read-all + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + timeout-minutes: 360 + permissions: + # required for all workflows + security-events: write + + # only required for workflows in private repositories + actions: read + contents: read + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: c-cpp + + - name: configure + run: + cmake -B build . + -DKokkos_ENABLE_OPENMP=ON + -DCMAKE_CXX_STANDARD=17 + -DKokkos_ENABLE_DEPRECATED_CODE_4=OFF + -DKokkos_ENABLE_TESTS=ON + -DKokkos_ENABLE_EXAMPLES=ON + -DKokkos_ENABLE_BENCHMARKS=ON + -DCMAKE_BUILD_TYPE=Debug + - name: build + run: + cmake --build build --parallel 2 + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v3 + with: + category: "/language:c-cpp" diff --git a/.github/workflows/continuous-integration-workflow-32bit.yml b/.github/workflows/continuous-integration-workflow-32bit.yml index 68fbdbe8a47..87c21d3a6e7 100644 --- a/.github/workflows/continuous-integration-workflow-32bit.yml +++ b/.github/workflows/continuous-integration-workflow-32bit.yml @@ -9,6 +9,8 @@ on: - '**/*.md' types: [ opened, reopened, synchronize ] +permissions: read-all + concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} @@ -21,7 +23,7 @@ jobs: image: ghcr.io/kokkos/ci-containers/ubuntu:latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: install_multilib run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib gfortran-multilib - name: Configure Kokkos diff --git a/.github/workflows/continuous-integration-workflow-hpx.yml b/.github/workflows/continuous-integration-workflow-hpx.yml index 8b39350dc87..35704a28cf3 100644 --- a/.github/workflows/continuous-integration-workflow-hpx.yml +++ b/.github/workflows/continuous-integration-workflow-hpx.yml @@ -13,6 +13,8 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: hpx: name: hpx @@ -20,7 +22,7 @@ jobs: steps: - name: checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: path: kokkos - name: setup hpx dependencies @@ -33,12 +35,12 @@ jobs: libboost-all-dev \ ninja-build - name: checkout hpx - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: STELLAR-GROUP/hpx - ref: 1.8.0 + ref: v1.9.0 path: hpx - - uses: actions/cache@v3 + - uses: actions/cache@v4 id: cache-hpx with: path: 
./hpx/install diff --git a/.github/workflows/continuous-integration-workflow.yml b/.github/workflows/continuous-integration-workflow.yml index 8c226c3766c..6425cc2668e 100644 --- a/.github/workflows/continuous-integration-workflow.yml +++ b/.github/workflows/continuous-integration-workflow.yml @@ -13,6 +13,8 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: CI: continue-on-error: true @@ -25,31 +27,39 @@ jobs: backend: ['OPENMP'] clang-tidy: [''] include: - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpc' cxx_extra_flags: '-diag-disable=177,10441' cmake_build_type: 'Debug' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Release' backend: 'OPENMP' - - distro: 'fedora:intel' + - distro: 'ubuntu:intel' cxx: 'icpx' cxx_extra_flags: '-fp-model=precise -Wno-pass-failed' cmake_build_type: 'Debug' backend: 'OPENMP' - distro: 'ubuntu:latest' cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address' + extra_linker_flags: '-fsanitize=address' cmake_build_type: 'RelWithDebInfo' backend: 'THREADS' clang-tidy: '-DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*"' + - distro: 'ubuntu:latest' + cxx: 'clang++' + cxx_extra_flags: '-fsanitize=address' + extra_linker_flags: '-fsanitize=address' + cmake_build_type: 'RelWithDebInfo' + backend: 'SERIAL' - distro: 'ubuntu:latest' cxx: 'g++' cmake_build_type: 'RelWithDebInfo' @@ -59,7 +69,7 @@ jobs: image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }} steps: - name: Checkout desul - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: desul/desul ref: 477da9c8f40f8db369c28dd3f93a67e376d8511b @@ -74,8 +84,8 @@ jobs: cmake -DDESUL_ENABLE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/desul-install .. sudo cmake --build . 
--target install --parallel 2 - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@v4 + - uses: actions/cache@v4 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}-${{ github.sha }} @@ -106,6 +116,7 @@ jobs: -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ -DKokkos_ENABLE_IMPL_MDSPAN=ON \ -DCMAKE_CXX_FLAGS="-Werror ${{ matrix.cxx_extra_flags }}" \ + -DCMAKE_EXE_LINKER_FLAGS="${{ matrix.extra_linker_flags }}" \ -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} @@ -118,6 +129,7 @@ jobs: working-directory: builddir run: ctest --output-on-failure - name: Test linking against build dir + if: ${{ matrix.cxx_extra_flags != '-fsanitize=address' }} working-directory: example/build_cmake_installed run: | cmake -B builddir_buildtree -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} -DKokkos_ROOT=../../builddir @@ -128,6 +140,7 @@ jobs: - name: Install run: sudo cmake --build builddir --target install - name: Test install + if: ${{ matrix.cxx_extra_flags != '-fsanitize=address' }} working-directory: example/build_cmake_installed run: | cmake -B builddir -DCMAKE_CXX_COMPILER=${{ matrix.cxx }} diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 85b079e56c8..2bcf41a0d30 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -13,6 +13,8 @@ concurrency: group: ${ {github.event_name }}-${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{github.event_name == 'pull_request'}} +permissions: read-all + jobs: osxci: name: osx-ci @@ -31,7 +33,7 @@ jobs: cmake_build_type: "Release" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: configure run: cmake -B build . diff --git a/.github/workflows/performance-benchmark.yml b/.github/workflows/performance-benchmark.yml index 59eed4f6096..bfbbeea4a85 100644 --- a/.github/workflows/performance-benchmark.yml +++ b/.github/workflows/performance-benchmark.yml @@ -8,6 +8,8 @@ on: - '**/*.md' types: [ opened, reopened, synchronize ] +permissions: read-all + jobs: CI: continue-on-error: true @@ -23,8 +25,8 @@ jobs: BUILD_ID: ${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }} steps: - name: Checkout code - uses: actions/checkout@v3 - - uses: actions/cache@v3 + uses: actions/checkout@v4 + - uses: actions/cache@v4 with: path: ~/.cache/ccache key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.backend }}-${{ github.ref }}-${{ github.sha }} diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 00000000000..3d7ede20773 --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,73 @@ +# This workflow uses actions that are not certified by GitHub. They are provided +# by a third-party and are governed by separate terms of service, privacy +# policy, and support documentation. + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + # To guarantee Maintained check is occasionally updated. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained + schedule: + # Weekly on Saturdays. + - cron: '30 1 * * 6' + push: + branches: [ master, develop ] + +# Declare default permissions as read only. 
+permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + # Uncomment the permissions below if installing in a private repository. + # contents: read + # actions: read + + steps: + - name: "Checkout code" + uses: actions/checkout@8e5e7e5ab8b370d6c329ec480221332ada57f0ab # v3.5.2 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@80e868c13c90f172d68d1f4501dee99e2479f7af # v2.1.3 + with: + results_file: results.sarif + results_format: sarif + # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: + # - you want to enable the Branch-Protection check on a *public* repository, or + # - you are installing Scorecard on a *private* repository + # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. + # repo_token: ${{ secrets.SCORECARD_TOKEN }} + + # Public repositories: + # - Publish results to OpenSSF REST API for easy access by consumers + # - Allows the repository to include the Scorecard badge. + # - See https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories: + # - `publish_results` will always be set to `false`, regardless + # of the value entered here. + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard. + - name: "Upload SARIF results to code scanning" + uses: github/codeql-action/upload-sarif@83f0fe6c4988d98a455712a27f0255212bba9bd4 # v2.3.6 + with: + sarif_file: results.sarif diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml new file mode 100644 index 00000000000..1bea94a721b --- /dev/null +++ b/.github/workflows/windows.yml @@ -0,0 +1,35 @@ +name: github-windows + +on: + push: + pull_request: + +concurrency: + group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{github.event_name == 'pull_request'}} + +permissions: read-all + +jobs: + windows-cuda: + # Cuda build on Windows + name: Windows Cuda + runs-on: windows-2022 + + steps: + - uses: Jimver/cuda-toolkit@v0.2.14 + id: cuda-toolkit + with: + cuda: '12.1.0' + - uses: actions/checkout@v4 + - name: configure + shell: bash + run: | + mkdir build + mkdir c:/project + cd build + cmake -DKokkos_ENABLE_CUDA=ON -DKokkos_ARCH_VOLTA70=ON -DKokkos_ENABLE_TESTS=ON -DKokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE=ON .. 
+ - name: build library + shell: bash + run: | + cmake --build build --parallel 2 --config Release diff --git a/.jenkins b/.jenkins index 6f5cf80033f..ae3bffd92d7 100644 --- a/.jenkins +++ b/.jenkins @@ -8,16 +8,21 @@ pipeline { } options { + disableConcurrentBuilds(abortPrevious: true) timeout(time: 6, unit: 'HOURS') } + triggers { + issueCommentTrigger('.*test this please.*') + } + stages { stage('Clang-Format') { agent { dockerfile { filename 'Dockerfile.clang' dir 'scripts/docker' - label 'nvidia-docker || rocm-docker || docker' + label 'nvidia-docker || docker' args '-v /tmp/ccache.kokkos:/tmp/ccache' } } @@ -102,12 +107,11 @@ pipeline { } steps { sh 'ccache --zero-stats' - sh '''. /opt/intel/oneapi/setvars.sh --include-intel-llvm && \ - rm -rf build && mkdir -p build && cd build && \ + sh '''rm -rf build && mkdir -p build && cd build && \ cmake \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_COMPILER=/opt/intel/oneapi/compiler/2023.0.0/linux/bin-llvm/clang++ \ + -DCMAKE_CXX_COMPILER=clang++ \ -DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Wno-deprecated-declarations -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-unknown-cuda-version -Wno-sycl-target" \ -DKOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED=0 \ -DKokkos_ARCH_NATIVE=ON \ @@ -135,8 +139,8 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2' - label 'rocm-docker && vega' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.2-complete' + label 'rocm-docker ' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } @@ -177,8 +181,8 @@ pipeline { dockerfile { filename 'Dockerfile.hipcc' dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6' - label 'rocm-docker && vega' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:5.6-complete' + label 'rocm-docker' args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' } } @@ -390,7 +394,6 @@ pipeline { -DKokkos_ENABLE_CUDA_LAMBDA=OFF \ -DKokkos_ENABLE_CUDA_UVM=ON \ -DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_IMPL_MDSPAN=ON \ @@ -493,7 +496,6 @@ pipeline { -DCMAKE_CXX_FLAGS=-Werror \ -DKokkos_ARCH_NATIVE=ON \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEPRECATED_CODE_3=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_4=ON \ -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \ -DKokkos_ENABLE_TESTS=ON \ diff --git a/CHANGELOG.md b/CHANGELOG.md index 40e3c95f24f..f7b8af7695c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,105 @@ # CHANGELOG +## [4.3.00](https://github.com/kokkos/kokkos/tree/4.3.00) (2024-03-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.01...4.3.00) + +### Features: +* Add `Experimental::sort_by_key(exec, keys, values)` algorithm [\#6801](https://github.com/kokkos/kokkos/pull/6801) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental multi-GPU support (from the same process) [\#6782](https://github.com/kokkos/kokkos/pull/6782) +* Link against CUDA libraries even with KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE [\#6701](https://github.com/kokkos/kokkos/pull/6701) +* 
Don't use the compiler launcher script if the CMake compile language is CUDA. [\#6704](https://github.com/kokkos/kokkos/pull/6704) +* nvcc(wrapper): adding "long" and "short" versions for all flags [\#6615](https://github.com/kokkos/kokkos/pull/6615) + +#### HIP: +* Fix compilation when using amdclang (with ROCm >= 5.7) and RDC [\#6857](https://github.com/kokkos/kokkos/pull/6857) +* Use rocthrust for sorting, when available [\#6793](https://github.com/kokkos/kokkos/pull/6793) + +#### SYCL: +* We only support the oneAPI SYCL implementation: add check during initialization + * Error out on initialization if the backend is different from `ext_oneapi_*` [\#6784](https://github.com/kokkos/kokkos/pull/6784) + * Filter GPU devices for `ext_oneapi_*` GPU devices [\#6758](https://github.com/kokkos/kokkos/pull/6758) +* Performance Improvements + * Avoid unnecessary zero-memset of the scratch flags in SYCL [\#6739](https://github.com/kokkos/kokkos/pull/6739) + * Use host-pinned memory to copy reduction/scan result [\#6500](https://github.com/kokkos/kokkos/pull/6500) +* Address deprecations after oneAPI 2023.2.0 [\#6577](https://github.com/kokkos/kokkos/pull/6577) +* Make sure to call find_dependency for oneDPL if necessary [\#6870](https://github.com/kokkos/kokkos/pull/6870) + +#### OpenMPTarget: +* Use LLVM extensions for dynamic shared memory [\#6380](https://github.com/kokkos/kokkos/pull/6380) +* Guard scratch memory usage in ParallelReduce [\#6585](https://github.com/kokkos/kokkos/pull/6585) +* Update linker flags for Intel GPUs [\#6735](https://github.com/kokkos/kokkos/pull/6735) +* Improve handling of printf on Intel GPUs [\#6652](https://github.com/kokkos/kokkos/pull/6652) + +#### OpenACC: +* Add atomics support [\#6446](https://github.com/kokkos/kokkos/pull/6446) +* Make the OpenACC backend asynchronous [\#6772](https://github.com/kokkos/kokkos/pull/6772) + +#### Threads: +* Add missing broadcast to TeamThreadRange parallel_scan [\#6601](https://github.com/kokkos/kokkos/pull/6601) + +#### OpenMP: +* Improve performance of view initializations and filling with zeros [\#6573](https://github.com/kokkos/kokkos/pull/6573) + +### General Enhancements + +* Improve performance of random number generation when using a normal distribution on GPUs [\#6556](https://github.com/kokkos/kokkos/pull/6556) +* Allocate temporary view with the user-provided execution space instance and do not initialize in `unique` algorithm [\#6598](https://github.com/kokkos/kokkos/pull/6598) +* Add deduction guide for `Kokkos::Array` [\#6373](https://github.com/kokkos/kokkos/pull/6373) +* Provide new public headers `<Kokkos_Clamp.hpp>` and `<Kokkos_MinMax.hpp>` [\#6687](https://github.com/kokkos/kokkos/pull/6687) +* Fix/improvement to `remove_if` parallel algorithm: use the provided execution space instance for temporary allocations, drop unnecessary initialization, and avoid evaluating the predicate twice during the final pass [\#6747](https://github.com/kokkos/kokkos/pull/6747) +* Add runtime function to query the number of devices and make device ID consistent with `KOKKOS_VISIBLE_DEVICES` [\#6713](https://github.com/kokkos/kokkos/pull/6713) +* simd: support `vector_aligned_tag` [\#6243](https://github.com/kokkos/kokkos/pull/6243) +* Avoid unnecessary allocation when default constructing Bitset [\#6524](https://github.com/kokkos/kokkos/pull/6524) +* Fix constness for views in std algorithms [\#6813](https://github.com/kokkos/kokkos/pull/6813) +* Improve error message on unsafe implicit conversion in MDRangePolicy [\#6855](https://github.com/kokkos/kokkos/pull/6855) +* CTAD (deduction guides) for RangePolicy (see the sketch below) [\#6850](https://github.com/kokkos/kokkos/pull/6850) +* CTAD (deduction guides) for MDRangePolicy [\#5516](https://github.com/kokkos/kokkos/pull/5516)
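To make the deduction-guide entries above concrete, here is a minimal sketch of what [\#6373](https://github.com/kokkos/kokkos/pull/6373) and [\#6850](https://github.com/kokkos/kokkos/pull/6850) enable. It is not part of this PR; the lambda body and names are illustrative only, and it assumes a Kokkos 4.3 build:

```c++
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);
  // Deduction guide for Kokkos::Array (#6373): deduces Kokkos::Array<double, 3>.
  Kokkos::Array a{1.0, 2.0, 3.0};
  // CTAD for RangePolicy (#6850): RangePolicy<> over the default execution
  // space is deduced, with no template arguments spelled out.
  double sum = 0.0;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy(0, 3),
      KOKKOS_LAMBDA(int i, double& partial) { partial += a[i]; }, sum);
  return 0;
}
```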
+ +### Build System Changes +* Require `Kokkos_ENABLE_ATOMICS_BYPASS` option to bypass atomic operations for Serial backend only builds [\#6692](https://github.com/kokkos/kokkos/pull/6692) +* Add support for RISC-V and the Milk-V Pioneer [\#6773](https://github.com/kokkos/kokkos/pull/6773) +* Add C++26 standard to CMake setup [\#6733](https://github.com/kokkos/kokkos/pull/6733) +* Fix Makefile when using gnu_generate_makefile.sh and make >= 4.3 [\#6606](https://github.com/kokkos/kokkos/pull/6606) +* Cuda: Fix configuring with CMake >= 3.28.4 - temporary fallback to internal CudaToolkit.cmake [\#6898](https://github.com/kokkos/kokkos/pull/6898) + +### Incompatibilities (i.e. breaking changes) +* Remove the `DEPRECATED_CODE_3` option and all code that was guarded by it [\#6523](https://github.com/kokkos/kokkos/pull/6523) +* Drop guards to accommodate external code defining `KOKKOS_ASSERT` [\#6665](https://github.com/kokkos/kokkos/pull/6665) +* `Profiling::ProfilingSection(std::string)` constructor marked explicit and nodiscard [\#6690](https://github.com/kokkos/kokkos/pull/6690) +* Add bound check preconditions for `RangePolicy` and `MDRangePolicy` [\#6617](https://github.com/kokkos/kokkos/pull/6617) [\#6726](https://github.com/kokkos/kokkos/pull/6726) +* Add checks for unsafe implicit conversions in RangePolicy [\#6754](https://github.com/kokkos/kokkos/pull/6754) +* Remove Kokkos::[b]half_t volatile overloads [\#6579](https://github.com/kokkos/kokkos/pull/6579) +* Remove KOKKOS_IMPL_DO_NOT_USE_PRINTF [\#6593](https://github.com/kokkos/kokkos/pull/6593) +* Check matching static extents in View constructor [\#5190](https://github.com/kokkos/kokkos/pull/5190) +* Tools(profiling): fix typo Kokkos_Tools_Optim[i]zationGoal [\#6642](https://github.com/kokkos/kokkos/pull/6642) +* Remove variadic range policy constructor (disallow passing multiple trailing chunk size arguments) [\#6845](https://github.com/kokkos/kokkos/pull/6845) +* Improve message on view out of bounds access and always abort [\#6861](https://github.com/kokkos/kokkos/pull/6861) +* Drop `KOKKOS_ENABLE_INTEL_MM_ALLOC` macro [\#6797](https://github.com/kokkos/kokkos/pull/6797) +* Remove `Kokkos::Experimental::LogicalMemorySpace` (without going through deprecation) [\#6557](https://github.com/kokkos/kokkos/pull/6557) +* Remove `Experimental::HBWSpace` and support for linking against memkind [\#6791](https://github.com/kokkos/kokkos/pull/6791) +* Drop librt TPL and associated `KOKKOS_ENABLE_LIBRT` macro [\#6798](https://github.com/kokkos/kokkos/pull/6798) +* Drop support for old CPU architectures (`ARCH_BGQ`, `ARCH_POWER7`, `ARCH_WSM` and associated `ARCH_SSE4` macro) [\#6806](https://github.com/kokkos/kokkos/pull/6806) +* Drop support for deprecated command-line arguments and environment variables [\#6744](https://github.com/kokkos/kokkos/pull/6744) + +### Deprecations +* Provide kokkos_swap as part of Core and deprecate Experimental::swap in Algorithms [\#6697](https://github.com/kokkos/kokkos/pull/6697) +* Deprecate {Cuda,HIP}::detect_device_count() and Cuda::[detect_]device_arch() [\#6710](https://github.com/kokkos/kokkos/pull/6710) +* Deprecate `ExecutionSpace::in_parallel()` [\#6582](https://github.com/kokkos/kokkos/pull/6582) + +### Bug Fixes +* Fix team-level MDRange reductions: 
[\#6511](https://github.com/kokkos/kokkos/pull/6511) +* Fix CUDA and SYCL small value type (16-bit) team reductions [\#5334](https://github.com/kokkos/kokkos/pull/5334) +* Enable `{transform_}exclusive_scan` in place [\#6667](https://github.com/kokkos/kokkos/pull/6667) +* `fill_random` overload that do not take an execution space instance argument should fence [\#6658](https://github.com/kokkos/kokkos/pull/6658) +* HIP,Cuda,OpenMPTarget: Fixup use provided execution space when copying host inaccessible reduction result [\#6777](https://github.com/kokkos/kokkos/pull/6777) +* Fix typo in `cuda_func_set_attribute[s]_wrapper` preventing proper setting of desired occupancy [\#6786](https://github.com/kokkos/kokkos/pull/6786) +* Avoid undefined behavior due to conversion between signed and unsigned integers in shift_{right, left}_team_impl [\#6821](https://github.com/kokkos/kokkos/pull/6821) +* Fix a bug in Makefile.kokkos when using AMD GPU architectures as `AMD_GFXYYY` [\#6892](https://github.com/kokkos/kokkos/pull/6892) + ## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) @@ -999,95 +1099,95 @@ - Major update for OpenMPTarget: many capabilities now work. For details contact us. - Added DPC++/SYCL backend: primary capabilites are working. - Added Kokkos Graph API analogous to CUDA Graphs. -- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/#3536) -- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/#3546) -- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/#3439) -- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/#3379) +- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536) +- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546) +- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439) +- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379) **Implemented enhancements Backends and Archs:** -- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/#3614) -- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/#3375) -- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/#3583) -- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/#3577) -- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/#3544) -- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/#3550) -- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/#3480) -- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/#3474) -- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/#3451) -- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/#3447) -- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/#3504) -- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/#3411) -- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/#3440) -- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/#3418) -- HIP Implement multiple occupancy paths for various HIP kernel launchers 
[\#3366](https://github.com/kokkos/kokkos/pull/#3366) +- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614) +- Adding support for AMD gfx908 archictecture [\#3375](https://github.com/kokkos/kokkos/pull/3375) +- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583) +- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577) +- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544) +- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550) +- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480) +- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474) +- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451) +- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447) +- OpenMPTarget: Hierarchial reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504) +- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411) +- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440) +- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418) +- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/3366) **Implemented enhancements Policies:** -- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/#3494) -- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/#3527) -- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/#3395) -- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/#3362) -- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/#3369) -- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/#3206) -- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/#3509) +- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494) +- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/3527) +- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/3395) +- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/3362) +- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/3369) +- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/3206) +- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/3509) **Implemented enhancements BuildSystem:** -- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/#3488) -- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/#3548) -- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/#3136) -- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/#3434) -- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/#3402) -- 
cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/#3457) +- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/3488) +- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/3548) +- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/3136) +- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/3434) +- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/3402) +- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/3457) **Implemented enhancements Tools:** -- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/#3455) -- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/#3530) -- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/#3518) -- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/#3459) -- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/#3326) +- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/3455) +- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/3530) +- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/3518) +- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/3459) +- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/3326) **Implemented enhancements Other:** -- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/#3528) -- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/#3449) -- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/#3436) -- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/#3435) -- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/#3422) -- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/#3416) -- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/#3388) -- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/#3359) -- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/#3357) -- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/#3340) -- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/#3339) -- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/#3338) -- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/#3309) -- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/#3265) -- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/#2941) +- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/3528) +- Enable C++14 macros unconditionally 
[\#3449](https://github.com/kokkos/kokkos/pull/3449) +- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/3436) +- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/3435) +- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/3422) +- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/3416) +- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/3388) +- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/3359) +- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/3357) +- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/3340) +- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/3339) +- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/3338) +- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/3309) +- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/3265) +- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/2941) **Fixed bugs:** -- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/#3591) -- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/#3588) -- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/#3566) -- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/#3565) -- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/#3532) -- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/#3529) -- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/#3510) -- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/#3503) -- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/#3467) -- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/#3458) -- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/#3398) -- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/#3393) -- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/#3390) -- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/#3378) -- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/#3348) -- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/#3345) -- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/#3343) -- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/#3260) +- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/3591) +- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/3588) +- Fixup silent pointless comparison with zero in checked\_narrow\_cast 
(compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/3566) +- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/3565) +- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/3532) +- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/3529) +- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/3510) +- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/3503) +- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/3467) +- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/3458) +- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/3398) +- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/3393) +- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/3390) +- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/3378) +- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/3348) +- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/3345) +- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/3343) +- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/3260) **Incompatibilities:** -- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/#3535) -- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/#3534) -- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/#3301) -- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/#3264) -- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/#3148) +- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/3535) +- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/3534) +- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/3301) +- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/3264) +- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/3148) ## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a4e7a55019..93a796f200b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -150,8 +150,8 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 2) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 3) +set(Kokkos_VERSION_PATCH 0) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/Makefile.kokkos b/Makefile.kokkos index 6e28d2c0cc6..2c74dd77bfb 100644 --- a/Makefile.kokkos +++ b/Makefile.kokkos @@ -1,8 +1,8 @@ # Default settings common options. 
KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 2 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 3 +KOKKOS_VERSION_PATCH = 0 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -12,14 +12,14 @@ KOKKOS_DEVICES ?= "Threads" # Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX -# IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100 +# IBM: Power8,Power9 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" -# Options: hwloc,librt,experimental_memkind +# Options: hwloc KOKKOS_USE_TPLS ?= "" # Options: c++17,c++1z,c++20,c++2a,c++23,c++2b KOKKOS_CXX_STANDARD ?= "c++17" @@ -46,7 +46,7 @@ uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$( uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT) # Return a 1 if a string contains a substring and 0 if not # Note the search string should be without '"' -# Example: $(call kokkos_has_string,"hwloc,librt",hwloc) +# Example: $(call kokkos_has_string,"hwloc,libdl",hwloc) # Will return a 1 kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0) # Returns 1 if the path exists, 0 otherwise @@ -63,11 +63,11 @@ KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD), KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a) KOKKOS_INTERNAL_ENABLE_CXX23 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++23) KOKKOS_INTERNAL_ENABLE_CXX2B := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2b) +KOKKOS_INTERNAL_ENABLE_CXX26 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++26) +KOKKOS_INTERNAL_ENABLE_CXX2C := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2c) # Check for external libraries. KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) -KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) -KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind) # Check for advanced settings. KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) @@ -308,7 +308,6 @@ endif # Intel based. KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC) -KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM) KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB) KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW) KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) @@ -388,11 +387,9 @@ KOKKOS_INTERNAL_USE_ARCH_A64FX := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX) KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) | bc)) # IBM based. 
-KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) -KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7) KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9) -KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) +KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) # AMD based. KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) @@ -403,22 +400,37 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) endif endif -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)) + +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A) +endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)) -KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) +endif +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) + KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) +endif # Any AVX? 
-KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) # Incompatible flags? -KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -563,6 +575,16 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2B), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2B_FLAG) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX23") endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX26), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX26_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2C), 1) + #I cannot make CMake add this in a good way - so add it here + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2C_FLAG) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX26") +endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) @@ -602,27 +624,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC") endif -ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT") - KOKKOS_LIBS += -lrt - KOKKOS_TPL_LIBRARY_NAMES += rt -endif - -ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - ifneq ($(KOKKOS_CMAKE), yes) - ifneq ($(MEMKIND_PATH),) - KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include - KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib - KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib - KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include - KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib - endif - KOKKOS_LIBS += -lmemkind -lnuma - KOKKOS_TPL_LIBRARY_NAMES += memkind numa - endif - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE") -endif - ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS") endif @@ -689,10 +690,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND") - endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) tmp := $(call 
kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else @@ -817,20 +814,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1) endif endif -ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42") - - ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) - KOKKOS_CXXFLAGS += -xSSE4.2 - KOKKOS_LDFLAGS += -xSSE4.2 - else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) - else - # Assume that this is a really a GNU compiler. - KOKKOS_CXXFLAGS += -msse4.2 - KOKKOS_LDFLAGS += -msse4.2 - endif -endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX") @@ -1239,7 +1222,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") @@ -1279,10 +1261,6 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif - ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - endif endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) @@ -1393,11 +1371,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) KOKKOS_TPL_LIBRARY_NAMES += hpx endif -# Don't include Kokkos_HBWSpace.cpp if not using MEMKIND to avoid a link warning. -ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC)) -endif - # With Cygwin functions such as fdopen and fileno are not defined # when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here. 
Not sure if that has any bad side effects @@ -1451,6 +1424,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENMP */") endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") +else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") +endif tmp := $(call desul_append_header, "") tmp := $(call desul_append_header, "$H""endif") @@ -1483,7 +1462,7 @@ include $(KOKKOS_PATH)/Makefile.targets kokkos-clean: rm -f $(KOKKOS_OBJ_LINK) $(DESUL_CONFIG_HEADER) $(DESUL_INTERNAL_CONFIG_TMP) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \ KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \ - KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp + KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_SetupBackend.tmp libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS) ar cr libkokkos.a $(KOKKOS_OBJ_LINK) diff --git a/Makefile.targets b/Makefile.targets index ec8770dd7de..e6900a822a8 100644 --- a/Makefile.targets +++ b/Makefile.targets @@ -20,8 +20,6 @@ Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ta $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp -Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp @@ -30,8 +28,6 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp -Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp @@ -82,8 +78,10 @@ Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array endif ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1) -Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp 
+Kokkos_Threads_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Instance.cpp +Kokkos_Threads_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_Spinwait.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -123,6 +121,3 @@ Kokkos_OpenACC_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC Kokkos_OpenACC_SharedAllocationRecord.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp endif - -Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp diff --git a/README.md b/README.md index 033346e956e..19793bb82d9 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ To start learning about Kokkos: - [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability. -For questions find us on Slack: https://kokkosteam.slack.com or open a github issue. +For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. For non-public questions send an email to: *crtrott(at)sandia.gov* @@ -48,10 +48,10 @@ Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citati # License -[![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) +[![License](https://img.shields.io/badge/License-Apache--2.0_WITH_LLVM--exception-blue)](https://spdx.org/licenses/LLVM-exception.html) Under the terms of Contract DE-NA0003525 with NTESS, the U.S. Government retains certain rights in this software. -The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or -[here](https://github.com/kokkos/kokkos/blob/master/LICENSE). +The full license statement used in all headers is available [here](https://kokkos.org/kokkos-core-wiki/license.html) or +[here](https://github.com/kokkos/kokkos/blob/develop/LICENSE). diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000000..93cf6e3663e --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,12 @@ +# Reporting Security Issues + +To report a security issue, please email +[lebrungrandt@ornl.gov](mailto:lebrungrandt@ornl.gov) +and [crtrott@sandia.gov](mailto:crtrott@sandia.gov) +with a description of the issue, the steps you took to create the issue, +affected versions, and, if known, mitigations for the issue. + +Our vulnerability management team will respond within 5 working days of your +email. If the issue is confirmed as a vulnerability, we will open a +Security Advisory and acknowledge your contributions as part of it. This project +follows a 90-day disclosure timeline. diff --git a/Spack.md b/Spack.md index 79606c259d5..06c763a64ee 100644 --- a/Spack.md +++ b/Spack.md @@ -159,7 +159,6 @@ If you don't specify a CUDA build variant in a `packages.yaml` and you build you > spack install superscience ```` you may end up just getting the default Kokkos (i.e. Serial). 
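If the bare `spack install` above picks Serial, a hedged workaround (reusing the hypothetical `superscience` package from this example; `+cuda` and `cuda_arch` are variants of the Spack `kokkos` package) is to pin the Kokkos dependency explicitly and inspect the concretized tree first:

````bash
# Resolve superscience against a CUDA-enabled Kokkos (Volta here; adjust
# cuda_arch to your GPU) and review the tree before installing.
spack spec superscience ^kokkos +cuda cuda_arch=70
spack install superscience ^kokkos +cuda cuda_arch=70
````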
-Some examples are included in the `config/yaml` folder for common platforms.
 
 Before running `spack install ` we recommend running `spack spec ` to confirm your dependency tree is correct. For example, with Kokkos Kernels:
 ````bash
diff --git a/algorithms/src/CMakeLists.txt b/algorithms/src/CMakeLists.txt
index 16957789472..b490caca628 100644
--- a/algorithms/src/CMakeLists.txt
+++ b/algorithms/src/CMakeLists.txt
@@ -30,5 +30,5 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms
   ${CMAKE_CURRENT_SOURCE_DIR}
 )
 
-
-
+KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST)
+KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL)
diff --git a/algorithms/src/Kokkos_Random.hpp b/algorithms/src/Kokkos_Random.hpp
index 2d7d236d2fc..7df12b8518e 100644
--- a/algorithms/src/Kokkos_Random.hpp
+++ b/algorithms/src/Kokkos_Random.hpp
@@ -849,18 +849,17 @@ class Random_XorShift64 {
     return drand(end - start) + start;
  }
 
-  // Marsaglia polar method for drawing a standard normal distributed random
+  // Box-Muller method for drawing a standard normal distributed random
   // number
   KOKKOS_INLINE_FUNCTION
   double normal() {
-    double S = 2.0;
-    double U;
-    while (S >= 1.0) {
-      U = 2.0 * drand() - 1.0;
-      const double V = 2.0 * drand() - 1.0;
-      S = U * U + V * V;
-    }
-    return U * std::sqrt(-2.0 * std::log(S) / S);
+    constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>;
+
+    const double u = drand();
+    const double v = drand();
+    const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u));
+    const double theta = v * two_pi;
+    return r * Kokkos::cos(theta);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1094,18 +1093,17 @@ class Random_XorShift1024 {
     return drand(end - start) + start;
   }
 
-  // Marsaglia polar method for drawing a standard normal distributed random
+  // Box-Muller method for drawing a standard normal distributed random
   // number
   KOKKOS_INLINE_FUNCTION
   double normal() {
-    double S = 2.0;
-    double U;
-    while (S >= 1.0) {
-      U = 2.0 * drand() - 1.0;
-      const double V = 2.0 * drand() - 1.0;
-      S = U * U + V * V;
-    }
-    return U * std::sqrt(-2.0 * std::log(S) / S);
+    constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>;
+
+    const double u = drand();
+    const double v = drand();
+    const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u));
+    const double theta = v * two_pi;
+    return r * Kokkos::cos(theta);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -1545,13 +1543,23 @@ template
 void fill_random(ViewType a, RandomPool g,
                  typename ViewType::const_value_type begin,
                  typename ViewType::const_value_type end) {
-  fill_random(typename ViewType::execution_space{}, a, g, begin, end);
+  Kokkos::fence(
+      "fill_random: fence before since no execution space instance provided");
+  typename ViewType::execution_space exec;
+  fill_random(exec, a, g, begin, end);
+  exec.fence(
+      "fill_random: fence after since no execution space instance provided");
 }
 
 template <class ViewType, class RandomPool>
 void fill_random(ViewType a, RandomPool g,
                  typename ViewType::const_value_type range) {
-  fill_random(typename ViewType::execution_space{}, a, g, 0, range);
+  Kokkos::fence(
+      "fill_random: fence before since no execution space instance provided");
+  typename ViewType::execution_space exec;
+  fill_random(exec, a, g, 0, range);
+  exec.fence(
+      "fill_random: fence after since no execution space instance provided");
 }
 
 } // namespace Kokkos
diff --git a/algorithms/src/Kokkos_Sort.hpp b/algorithms/src/Kokkos_Sort.hpp
index f77484cc555..136b4ec82dc 100644
--- a/algorithms/src/Kokkos_Sort.hpp
+++ b/algorithms/src/Kokkos_Sort.hpp
@@ -23,6 +23,7 @@
 
 #include "sorting/Kokkos_BinSortPublicAPI.hpp"
 #include "sorting/Kokkos_SortPublicAPI.hpp"
+#include
"sorting/Kokkos_SortByKeyPublicAPI.hpp" #include "sorting/Kokkos_NestedSortPublicAPI.hpp" #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_SORT diff --git a/algorithms/src/Kokkos_StdAlgorithms.hpp b/algorithms/src/Kokkos_StdAlgorithms.hpp index 436ae0d10bf..b532a774e13 100644 --- a/algorithms/src/Kokkos_StdAlgorithms.hpp +++ b/algorithms/src/Kokkos_StdAlgorithms.hpp @@ -35,7 +35,6 @@ // following the std classification. // modifying ops -#include "std_algorithms/Kokkos_Swap.hpp" #include "std_algorithms/Kokkos_IterSwap.hpp" // non-modifying sequence diff --git a/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp b/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp new file mode 100644 index 00000000000..fc73eccad68 --- /dev/null +++ b/algorithms/src/sorting/Kokkos_SortByKeyPublicAPI.hpp @@ -0,0 +1,117 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ +#define KOKKOS_SORT_BY_KEY_PUBLIC_API_HPP_ + +#include "./impl/Kokkos_SortByKeyImpl.hpp" +#include +#include + +namespace Kokkos::Experimental { + +// --------------------------------------------------------------- +// basic overloads +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. 
The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_without_comparator(exec, keys, + values); +} + +// --------------------------------------------------------------- +// overloads supporting a custom comparator +// --------------------------------------------------------------- + +template +void sort_by_key( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + // constraints + using KeysType = Kokkos::View; + using ValuesType = Kokkos::View; + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(keys); + ::Kokkos::Impl::static_assert_is_admissible_to_kokkos_sort_by_key(values); + + static_assert(SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the keys View argument!"); + static_assert( + SpaceAccessibility::accessible, + "Kokkos::sort: execution space instance is not able to access " + "the memory space of the values View argument!"); + + static_assert(KeysType::static_extent(0) == 0 || + ValuesType::static_extent(0) == 0 || + KeysType::static_extent(0) == ValuesType::static_extent(0)); + if (values.size() != keys.size()) + Kokkos::abort((std::string("values and keys extents must be the same. The " + "values extent is ") + + std::to_string(values.size()) + ", and the keys extent is " + + std::to_string(keys.size()) + ".") + .c_str()); + + if (keys.extent(0) <= 1) { + return; + } + + ::Kokkos::Impl::sort_by_key_device_view_with_comparator(exec, keys, values, + comparator); +} + +} // namespace Kokkos::Experimental +#endif diff --git a/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index a763c41e580..308e9e3a008 100644 --- a/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -29,7 +29,7 @@ namespace Kokkos { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view) { // constraints using ViewType = Kokkos::View; @@ -52,6 +52,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort without comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last); @@ -82,7 +83,7 @@ void sort(const Kokkos::View& view) { // --------------------------------------------------------------- template -void sort([[maybe_unused]] const ExecutionSpace& exec, +void sort(const ExecutionSpace& exec, const Kokkos::View& view, const ComparatorType& comparator) { // constraints @@ -105,6 +106,7 @@ void sort([[maybe_unused]] const ExecutionSpace& exec, } if constexpr (Impl::better_off_calling_std_sort_v) { + exec.fence("Kokkos::sort with comparator use std::sort"); auto first = ::Kokkos::Experimental::begin(view); auto last = ::Kokkos::Experimental::end(view); std::sort(first, last, comparator); diff --git a/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp index 50ac8233195..2fe58272d92 100644 --- a/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_NestedSortImpl.hpp @@ 
-18,7 +18,6 @@ #define KOKKOS_NESTED_SORT_IMPL_HPP_ #include -#include namespace Kokkos { namespace Experimental { @@ -99,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void sort_nested_impl( keyView(elem1) = key2; keyView(elem2) = key1; if constexpr (!std::is_same_v) { - Kokkos::Experimental::swap(valueView(elem1), valueView(elem2)); + Kokkos::kokkos_swap(valueView(elem1), valueView(elem2)); } } } diff --git a/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp new file mode 100644 index 00000000000..36deccdfb1e --- /dev/null +++ b/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -0,0 +1,401 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ +#define KOKKOS_SORT_BY_KEY_FREE_FUNCS_IMPL_HPP_ + +#include + +#if defined(KOKKOS_ENABLE_CUDA) + +// Workaround for `Instruction 'shfl' without '.sync' is not supported on +// .target sm_70 and higher from PTX ISA version 6.4`. +// Also see https://github.com/NVIDIA/cub/pull/170. +#if !defined(CUB_USE_COOPERATIVE_GROUPS) +#define CUB_USE_COOPERATIVE_GROUPS +#endif + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" + +#if defined(KOKKOS_COMPILER_CLANG) +// Some versions of Clang fail to compile Thrust, failing with errors like +// this: +// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: +// error: use of undeclared identifier 'va_printf' +// The exact combination of versions for Clang and Thrust (or CUDA) for this +// failure was not investigated, however even very recent version combination +// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. +// +// Defining _CubLog here locally allows us to avoid that code path, however +// disabling some debugging diagnostics +#pragma push_macro("_CubLog") +#ifdef _CubLog +#undef _CubLog +#endif +#define _CubLog +#include +#include +#pragma pop_macro("_CubLog") +#else +#include +#include +#endif + +#pragma GCC diagnostic pop + +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) && \ + (ONEDPL_VERSION_MAJOR > 2022 || \ + (ONEDPL_VERSION_MAJOR == 2022 && ONEDPL_VERSION_MINOR >= 2)) +#define KOKKOS_ONEDPL_HAS_SORT_BY_KEY +#include +#include +#endif + +namespace Kokkos::Impl { + +template +constexpr inline bool is_admissible_to_kokkos_sort_by_key = + ::Kokkos::is_view::value&& T::rank() == 1 && + (std::is_same::value || + std::is_same::value || + std::is_same::value); + +template +KOKKOS_INLINE_FUNCTION constexpr void +static_assert_is_admissible_to_kokkos_sort_by_key(const ViewType& /* view */) { + static_assert(is_admissible_to_kokkos_sort_by_key, + "Kokkos::sort_by_key only accepts 1D values View with " + "LayoutRight, LayoutLeft or LayoutStride."); +} + +// For the fallback implementation for sort_by_key using Kokkos::sort, we need +// to consider if Kokkos::sort defers to the fallback implementation that copies +// the array to the host and uses std::sort, see +// copy_to_host_run_stdsort_copy_back() in impl/Kokkos_SortImpl.hpp. 
If +// sort_on_device_v is true, we assume that std::sort doesn't copy data. +// Otherwise, we manually copy all data to the host and provide Kokkos::sort +// with a host execution space. +template +inline constexpr bool sort_on_device_v = false; + +#if defined(KOKKOS_ENABLE_CUDA) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_cudathrust( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::cuda::par.on(exec.cuda_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +inline constexpr bool sort_on_device_v = true; + +template +void sort_by_key_rocthrust( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + const auto policy = thrust::hip::par.on(exec.hip_stream()); + auto keys_first = ::Kokkos::Experimental::begin(keys); + auto keys_last = ::Kokkos::Experimental::end(keys); + auto values_first = ::Kokkos::Experimental::begin(values); + thrust::sort_by_key(policy, keys_first, keys_last, values_first, + std::forward(maybeComparator)...); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +inline constexpr bool sort_on_device_v = + std::is_same_v || + std::is_same_v; + +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY +template +void sort_by_key_onedpl( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... maybeComparator) { + if (keys.stride(0) != 1 && values.stride(0) != 1) { + Kokkos::abort( + "SYCL sort_by_key only supports rank-1 Views with stride(0) = 1."); + } + + // Can't use Experimental::begin/end here since the oneDPL then assumes that + // the data is on the host. + auto queue = exec.sycl_queue(); + auto policy = oneapi::dpl::execution::make_device_policy(queue); + const int n = keys.extent(0); + oneapi::dpl::sort_by_key(policy, keys.data(), keys.data() + n, values.data(), + std::forward(maybeComparator)...); +} +#endif +#endif + +template +void applyPermutation(const ExecutionSpace& space, + const PermutationView& permutation, + const ViewType& view) { + static_assert(std::is_integral::value); + + auto view_copy = Kokkos::create_mirror( + Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, + Kokkos::WithoutInitializing), + view); + Kokkos::deep_copy(space, view_copy, view); + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::permute_" + view.label(), + Kokkos::RangePolicy(space, 0, view.extent(0)), + KOKKOS_LAMBDA(int i) { view(i) = view_copy(permutation(i)); }); +} + +template +void sort_by_key_via_sort( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + MaybeComparator&&... 
maybeComparator) { + static_assert(sizeof...(MaybeComparator) <= 1); + + auto const n = keys.size(); + + Kokkos::View permute( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "Kokkos::sort_by_key_via_sort::permute"), + n); + + // iota + Kokkos::parallel_for( + "Kokkos::sort_by_key_via_sort::iota", + Kokkos::RangePolicy(exec, 0, n), + KOKKOS_LAMBDA(int i) { permute(i) = i; }); + + using Layout = + typename Kokkos::View::array_layout; + if constexpr (!sort_on_device_v) { + auto host_keys = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + keys); + auto host_permute = Kokkos::create_mirror_view( + Kokkos::view_alloc(Kokkos::HostSpace{}, Kokkos::WithoutInitializing), + permute); + Kokkos::deep_copy(exec, host_keys, keys); + Kokkos::deep_copy(exec, host_permute, permute); + + exec.fence("Kokkos::Impl::sort_by_key_via_sort: before host sort"); + Kokkos::DefaultHostExecutionSpace host_exec; + + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + host_exec, host_permute, + KOKKOS_LAMBDA(int i, int j) { return host_keys(i) < host_keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + host_exec, host_permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(host_keys(i), host_keys(j)); + }); + } + host_exec.fence("Kokkos::Impl::sort_by_key_via_sort: after host sort"); + Kokkos::deep_copy(exec, permute, host_permute); + } else { +#ifdef KOKKOS_ENABLE_SYCL + auto* raw_keys_in_comparator = keys.data(); + auto stride = keys.stride(0); + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return raw_keys_in_comparator[i * stride] < + raw_keys_in_comparator[j * stride]; + }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(raw_keys_in_comparator[i * stride], + raw_keys_in_comparator[j * stride]); + }); + } +#else + if constexpr (sizeof...(MaybeComparator) == 0) { + Kokkos::sort( + exec, permute, + KOKKOS_LAMBDA(int i, int j) { return keys(i) < keys(j); }); + } else { + auto keys_comparator = + std::get<0>(std::tuple(maybeComparator...)); + Kokkos::sort( + exec, permute, KOKKOS_LAMBDA(int i, int j) { + return keys_comparator(keys(i), keys(j)); + }); + } +#endif + } + + applyPermutation(exec, permute, keys); + applyPermutation(exec, permute, values); +} + +// ------------------------------------------------------ +// +// specialize cases for sorting by key without comparator +// +// ------------------------------------------------------ + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_cudathrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_rocthrust(exec, keys, values); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_without_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values); + else +#endif + sort_by_key_via_sort(exec, keys, 
values); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_without_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values) { + sort_by_key_via_sort(exec, keys, values); +} + +// --------------------------------------------------- +// +// specialize cases for sorting by key with comparator +// +// --------------------------------------------------- + +#if defined(KOKKOS_ENABLE_CUDA) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Cuda& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_cudathrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::HIP& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_rocthrust(exec, keys, values, comparator); +} +#endif + +#if defined(KOKKOS_ENABLE_ONEDPL) +template +void sort_by_key_device_view_with_comparator( + const Kokkos::Experimental::SYCL& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { +#ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + if (keys.stride(0) == 1 && values.stride(0) == 1) + sort_by_key_onedpl(exec, keys, values, comparator); + else +#endif + sort_by_key_via_sort(exec, keys, values, comparator); +} +#endif + +// fallback case +template +std::enable_if_t::value> +sort_by_key_device_view_with_comparator( + const ExecutionSpace& exec, + const Kokkos::View& keys, + const Kokkos::View& values, + const ComparatorType& comparator) { + sort_by_key_via_sort(exec, keys, values, comparator); +} + +#undef KOKKOS_ONEDPL_HAS_SORT_BY_KEY + +} // namespace Kokkos::Impl +#endif diff --git a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index d87ab09e772..4c174b5fda9 100644 --- a/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -63,6 +63,11 @@ #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +#include +#include +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) #include #include @@ -184,6 +189,26 @@ void sort_cudathrust(const Cuda& space, } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_rocthrust(const HIP& space, + const Kokkos::View& view, + MaybeComparator&&... 
maybeComparator) { + using ViewType = Kokkos::View; + static_assert(ViewType::rank == 1, + "Kokkos::sort: currently only supports rank-1 Views."); + + if (view.extent(0) <= 1) { + return; + } + const auto exec = thrust::hip::par.on(space.hip_stream()); + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + thrust::sort(exec, first, last, + std::forward(maybeComparator)...); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_onedpl(const Kokkos::Experimental::SYCL& space, @@ -274,6 +299,14 @@ void sort_device_view_without_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_without_comparator( + const HIP& exec, const Kokkos::View& view) { + sort_rocthrust(exec, view); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( @@ -320,6 +353,15 @@ void sort_device_view_with_comparator( } #endif +#if defined(KOKKOS_ENABLE_ROCTHRUST) +template +void sort_device_view_with_comparator( + const HIP& exec, const Kokkos::View& view, + const ComparatorType& comparator) { + sort_rocthrust(exec, view, comparator); +} +#endif + #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( diff --git a/algorithms/src/std_algorithms/Kokkos_Copy.hpp b/algorithms/src/std_algorithms/Kokkos_Copy.hpp index b7ce1ba5edb..c5406c72b0d 100644 --- a/algorithms/src/std_algorithms/Kokkos_Copy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Copy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp b/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp index 8f9e0f19b80..82071a9362e 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyBackward.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& 
dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp b/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp index ba18bc76b93..599fde5737a 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyIf.hpp @@ -54,7 +54,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -69,7 +70,8 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_if(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, + Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -96,7 +98,7 @@ template & source, - ::Kokkos::View& dest, Predicate pred) { + const ::Kokkos::View& dest, Predicate pred) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_CopyN.hpp b/algorithms/src/std_algorithms/Kokkos_CopyN.hpp index 43c91204837..637d8d4cbc5 100644 --- a/algorithms/src/std_algorithms/Kokkos_CopyN.hpp +++ b/algorithms/src/std_algorithms/Kokkos_CopyN.hpp @@ -51,7 +51,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -66,7 +66,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto copy_n(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -93,7 +93,7 @@ template & source, Size count, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_Equal.hpp b/algorithms/src/std_algorithms/Kokkos_Equal.hpp index a72a49cc22b..593c42f87e1 100644 --- a/algorithms/src/std_algorithms/Kokkos_Equal.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Equal.hpp @@ -80,7 +80,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -96,7 +96,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -111,7 +111,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -128,7 +128,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool equal(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -227,7 +227,7 @@ template & view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -243,7 +243,7 @@ template & view1, - ::Kokkos::View& view2, + const ::Kokkos::View& view2, BinaryPredicateType predicate) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp b/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp index a796a306dda..5bb2d1039dc 100644 --- a/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp +++ b/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp @@ -19,7 +19,6 @@ #include #include "impl/Kokkos_Constraints.hpp" -#include "Kokkos_Swap.hpp" namespace Kokkos { namespace Experimental { @@ -33,7 +32,7 @@ struct StdIterSwapFunctor { KOKKOS_FUNCTION void operator()(int i) const { (void)i; - ::Kokkos::Experimental::swap(*m_a, *m_b); + ::Kokkos::kokkos_swap(*m_a, *m_b); } KOKKOS_FUNCTION @@ -58,6 +57,16 @@ void iter_swap(IteratorType1 a, IteratorType2 b) { Impl::iter_swap_impl(a, b); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::kokkos_swap instead!") +KOKKOS_FUNCTION + void swap(T& a, T& b) noexcept(::Kokkos::kokkos_swap(std::declval(), + std::declval())) { + ::Kokkos::kokkos_swap(a, b); +} +#endif + } // namespace Experimental } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp b/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp index 4b5c69df451..e13479c370b 100644 --- a/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp +++ b/algorithms/src/std_algorithms/Kokkos_LexicographicalCompare.hpp @@ -54,7 +54,7 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -71,7 +71,7 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -112,7 +112,8 @@ template < bool lexicographical_compare( const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType 
comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -129,7 +130,8 @@ template < bool lexicographical_compare( const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -161,7 +163,7 @@ template & view1, - ::Kokkos::View& view2) { + const ::Kokkos::View& view2) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); @@ -187,7 +189,8 @@ template & view1, - ::Kokkos::View& view2, ComparatorType comp) { + const ::Kokkos::View& view2, + ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view2); diff --git a/algorithms/src/std_algorithms/Kokkos_Move.hpp b/algorithms/src/std_algorithms/Kokkos_Move.hpp index f04ea12ba88..ac308ea1845 100644 --- a/algorithms/src/std_algorithms/Kokkos_Move.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Move.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -92,7 +92,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp b/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp index 375474ca57f..2789ab21796 100644 --- a/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp +++ b/algorithms/src/std_algorithms/Kokkos_MoveBackward.hpp @@ -41,7 +41,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto move_backward(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git 
a/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp b/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp index 37336c983ab..66f39c4eaa6 100644 --- a/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp +++ b/algorithms/src/std_algorithms/Kokkos_ReverseCopy.hpp @@ -50,7 +50,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -65,7 +65,7 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto reverse_copy(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp b/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp index 39f33b64879..d66763d304c 100644 --- a/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp +++ b/algorithms/src/std_algorithms/Kokkos_SwapRanges.hpp @@ -40,7 +40,7 @@ template , int> = 0> auto swap_ranges(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -64,7 +64,7 @@ template , int> = 0> auto swap_ranges(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -94,7 +94,7 @@ template & source, - ::Kokkos::View& dest) { + const ::Kokkos::View& dest) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); diff --git a/algorithms/src/std_algorithms/Kokkos_Transform.hpp b/algorithms/src/std_algorithms/Kokkos_Transform.hpp index 838c9169e25..84cbed524d3 100644 --- a/algorithms/src/std_algorithms/Kokkos_Transform.hpp +++ b/algorithms/src/std_algorithms/Kokkos_Transform.hpp @@ -58,7 +58,7 @@ template , int> = 0> auto transform(const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -73,7 +73,7 @@ template , int> = 0> auto transform(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& source, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -119,7 +119,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -137,7 +137,7 @@ template & source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); @@ -174,7 +174,8 @@ template & source, - ::Kokkos::View& dest, UnaryOperation unary_op) { + const ::Kokkos::View& dest, + UnaryOperation unary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(dest); @@ -207,7 +208,7 @@ KOKKOS_FUNCTION auto transform( const TeamHandleType& teamHandle, const ::Kokkos::View& source1, const ::Kokkos::View& source2, - ::Kokkos::View& dest, + const ::Kokkos::View& dest, BinaryOperation binary_op) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source1); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(source2); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp index 8151ee34955..5a7fe16984a 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_FunctorsForExclusiveScan.hpp @@ -47,8 +47,9 @@ struct ExclusiveScanDefaultFunctorForKnownNeutralElement { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = m_first_from[i]; if (final_pass) m_first_dest[i] = update + m_init_value; - update += m_first_from[i]; + update += tmp; } }; @@ -73,6 +74,7 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_first_from[i], false}; if (final_pass) { if (i == 0) { m_first_dest[i] = m_init_value; @@ -81,7 +83,6 @@ struct ExclusiveScanDefaultFunctorWithValueWrapper { } } - const auto tmp = value_type{m_first_from[i], false}; this->join(update, tmp); } @@ -132,6 +133,7 @@ struct TransformExclusiveScanFunctorWithValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, value_type& update, const bool final_pass) const { + const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -142,7 +144,6 @@ struct TransformExclusiveScanFunctorWithValueWrapper { } } - const auto tmp = value_type{m_unary_op(m_first_from[i]), false}; this->join(update, tmp); } @@ -190,6 +191,7 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { KOKKOS_FUNCTION void operator()(const IndexType i, ValueType& update, const bool final_pass) const { + const auto tmp = ValueType{m_unary_op(m_first_from[i])}; if (final_pass) { if (i == 0) { // for both ExclusiveScan and TransformExclusiveScan, @@ -200,7 +202,6 @@ struct TransformExclusiveScanFunctorWithoutValueWrapper { } } - const auto tmp = ValueType{m_unary_op(m_first_from[i])}; this->join(update, tmp); } diff --git a/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp b/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp index 50224c8874e..456df43aed2 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_RemoveAllVariants.hpp @@ -46,15 +46,14 @@ struct StdRemoveIfStage1Functor { void operator()(const IndexType i, IndexType& 
update, const bool final_pass) const { auto& myval = m_first_from[i]; - if (final_pass) { - if (!m_must_remove(myval)) { + + if (!m_must_remove(myval)) { + if (final_pass) { // calling move here is ok because we are inside final pass // we are calling move assign as specified by the std m_first_dest[update] = std::move(myval); } - } - if (!m_must_remove(myval)) { update += 1; } } @@ -108,7 +107,9 @@ IteratorType remove_if_exespace_impl(const std::string& label, // create helper tmp view using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_remove_if_tmp_view", keep_count); + tmp_view_type tmp_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, ex, + "std_remove_if_tmp_view"), + keep_count); using tmp_readwrite_iterator_type = decltype(begin(tmp_view)); // in stage 1, *move* all elements to keep from original range to tmp diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index 428dc0d744a..b4046c7645b 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -39,7 +38,7 @@ struct StdReverseFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first[i], m_last[-i - 1]); + ::Kokkos::kokkos_swap(m_first[i], m_last[-i - 1]); } KOKKOS_FUNCTION diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp index 50bc7c8d610..94147485071 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ShiftLeft.hpp @@ -126,10 +126,11 @@ KOKKOS_FUNCTION IteratorType shift_left_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first + n, last); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { first[i] = std::move(first[i + n]); } }); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp b/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp index cac20bfbba6..0414e6f1c25 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_ShiftRight.hpp @@ -103,26 +103,6 @@ IteratorType shift_right_exespace_impl( return first + n; } -template -struct StdShiftRightTeamSingleFunctor { - Iterator m_first; - Iterator m_last; - std::size_t m_shift; - - KOKKOS_FUNCTION - void operator()() const { - // the impl function calling this functor guarantees that - // - m_shift is non-negative - // - m_first, m_last identify a valid range with m_last > m_first - // - m_shift is less than m_last - m_first - // so I can safely use std::size_t here - } - - KOKKOS_FUNCTION - StdShiftRightTeamSingleFunctor(Iterator _first, Iterator _last, std::size_t n) - : m_first(std::move(_first)), m_last(std::move(_last)), m_shift(n) {} -}; - template KOKKOS_FUNCTION IteratorType shift_right_team_impl( const TeamHandleType& teamHandle, IteratorType first, IteratorType last, @@ -145,10 +125,11 
@@ KOKKOS_FUNCTION IteratorType shift_right_team_impl( // execution space impl because for this team impl we are // within a parallel region, so for now we solve serially - const std::size_t numElementsToMove = + using difference_type = typename IteratorType::difference_type; + const difference_type numElementsToMove = ::Kokkos::Experimental::distance(first, last - n); Kokkos::single(Kokkos::PerTeam(teamHandle), [=]() { - for (std::size_t i = 0; i < numElementsToMove; ++i) { + for (difference_type i = 0; i < numElementsToMove; ++i) { last[-i - 1] = std::move(last[-n - i - 1]); } }); diff --git a/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp b/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp index 5bc77ed7ddc..930a14ac48c 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_SwapRanges.hpp @@ -21,7 +21,6 @@ #include "Kokkos_Constraints.hpp" #include "Kokkos_HelperPredicates.hpp" #include -#include #include namespace Kokkos { @@ -36,7 +35,7 @@ struct StdSwapRangesFunctor { KOKKOS_FUNCTION void operator()(index_type i) const { - ::Kokkos::Experimental::swap(m_first1[i], m_first2[i]); + ::Kokkos::kokkos_swap(m_first1[i], m_first2[i]); } KOKKOS_FUNCTION diff --git a/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp b/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp index 11afa8ed6e0..28635824585 100644 --- a/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp +++ b/algorithms/src/std_algorithms/impl/Kokkos_Unique.hpp @@ -105,7 +105,9 @@ IteratorType unique_exespace_impl(const std::string& label, // using the same algorithm used for unique_copy but we now move things using value_type = typename IteratorType::value_type; using tmp_view_type = Kokkos::View; - tmp_view_type tmp_view("std_unique_tmp_view", num_elements_to_explore); + tmp_view_type tmp_view(Kokkos::view_alloc(ex, Kokkos::WithoutInitializing, + "std_unique_tmp_view"), + num_elements_to_explore); // scan extent is: num_elements_to_explore - 1 // for same reason as the one explained in unique_copy diff --git a/algorithms/unit_tests/CMakeLists.txt b/algorithms/unit_tests/CMakeLists.txt index 419f5ec1d13..db184bc8a99 100644 --- a/algorithms/unit_tests/CMakeLists.txt +++ b/algorithms/unit_tests/CMakeLists.txt @@ -25,6 +25,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) set(ALGO_SORT_SOURCES) foreach(SOURCE_Input TestSort + TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB @@ -57,35 +58,37 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() + endif() +endforeach() - # ------------------------------------------ - # std set A - # ------------------------------------------ - set(STDALGO_SOURCES_A) - foreach(Name +# ------------------------------------------ +# std set A +# ------------------------------------------ +set(STDALGO_SOURCES_A) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator - ) - list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set B - # ------------------------------------------ - set(STDALGO_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std set B +# ------------------------------------------ +set(STDALGO_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps - ) - list(APPEND STDALGO_SOURCES_B 
Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set C - # ------------------------------------------ - set(STDALGO_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std set C +# ------------------------------------------ +set(STDALGO_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsLexicographicalCompare StdAlgorithmsForEach @@ -100,15 +103,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsSearch_n StdAlgorithmsMismatch StdAlgorithmsMoveBackward - ) - list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set D - # ------------------------------------------ - set(STDALGO_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std set D +# ------------------------------------------ +set(STDALGO_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsModOps StdAlgorithmsModSeqOps @@ -128,15 +131,15 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsReverse StdAlgorithmsShiftLeft StdAlgorithmsShiftRight - ) - list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std set E - # ------------------------------------------ - set(STDALGO_SOURCES_E) - foreach(Name +# ------------------------------------------ +# std set E +# ------------------------------------------ +set(STDALGO_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsIsSorted StdAlgorithmsIsSortedUntil @@ -149,83 +152,83 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTransformUnaryOp StdAlgorithmsTransformExclusiveScan StdAlgorithmsTransformInclusiveScan - ) - list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team Q - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_Q) - foreach(Name +# ------------------------------------------ +# std team Q +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_Q) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team P - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_P) - foreach(Name +# ------------------------------------------ +# std team P +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_P) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan - ) - list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team M - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_M) - foreach(Name +# ------------------------------------------ +# std team M +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_M) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp 
StdAlgorithmsTeamTransformBinaryOp StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges - ) - list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team L - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_L) - foreach(Name +# ------------------------------------------ +# std team L +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_L) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint - ) - list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team I - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_I) - foreach(Name +# ------------------------------------------ +# std team I +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_I) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce StdAlgorithmsTeamTransformReduce - ) - list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team H - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_H) - foreach(Name +# ------------------------------------------ +# std team H +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_H) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamCopy StdAlgorithmsTeamCopy_n @@ -236,43 +239,43 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamRemoveIf StdAlgorithmsTeamRemoveCopy StdAlgorithmsTeamRemoveCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team G - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_G) - foreach(Name +# ------------------------------------------ +# std team G +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_G) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft StdAlgorithmsTeamShiftRight - ) - list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team F - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_F) - foreach(Name +# ------------------------------------------ +# std team F +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_F) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate StdAlgorithmsTeamRotateCopy - ) - list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team E - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_E) - foreach(Name +# ------------------------------------------ +# std team E +# 
------------------------------------------ +set(STDALGO_TEAM_SOURCES_E) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFill StdAlgorithmsTeamFill_n @@ -280,28 +283,28 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamReplaceIf StdAlgorithmsTeamReplaceCopy StdAlgorithmsTeamReplaceCopyIf - ) - list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team D - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_D) - foreach(Name +# ------------------------------------------ +# std team D +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_D) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement - ) - list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team C - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_C) - foreach(Name +# ------------------------------------------ +# std team C +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_C) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamFind StdAlgorithmsTeamFindIf @@ -310,29 +313,29 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamAnyOf StdAlgorithmsTeamNoneOf StdAlgorithmsTeamSearchN - ) - list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team B - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_B) - foreach(Name +# ------------------------------------------ +# std team B +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_B) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd StdAlgorithmsTeamFindFirstOf - ) - list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) - endforeach() + ) + list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) +endforeach() - # ------------------------------------------ - # std team A - # ------------------------------------------ - set(STDALGO_TEAM_SOURCES_A) - foreach(Name +# ------------------------------------------ +# std team A +# ------------------------------------------ +set(STDALGO_TEAM_SOURCES_A) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamAdjacentFind StdAlgorithmsTeamCount @@ -341,11 +344,8 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) StdAlgorithmsTeamForEachN StdAlgorithmsTeamLexicographicalCompare StdAlgorithmsTeamMismatch - ) - list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) - endforeach() - - endif() + ) + list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. 
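# For context: the STDALGO_SOURCES_* / STDALGO_TEAM_SOURCES_* lists built
# above are consumed further down to define one test executable per group.
# A minimal sketch of that consumption (the macro and target names here are
# assumptions for illustration, not part of this diff):
#
#   KOKKOS_ADD_EXECUTABLE_AND_TEST(
#     AlgorithmsUnitTest_StdAlgos_Team_A
#     SOURCES TestStdAlgorithmsCommon.cpp ${STDALGO_TEAM_SOURCES_A}
#   )
#
# Splitting the Test${Name}.cpp files into fixed-size groups keeps the compile
# time and memory footprint of any single test target bounded when all
# backends in the surrounding foreach(Tag ...) loop are enabled.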
diff --git a/algorithms/unit_tests/Makefile b/algorithms/unit_tests/Makefile index 601217799a8..d3946c149ba 100644 --- a/algorithms/unit_tests/Makefile +++ b/algorithms/unit_tests/Makefile @@ -27,13 +27,13 @@ TARGETS = tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ - $(shell echo "\#include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " > Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ + $(shell echo "$(H)include " >> Test$(device).cpp); \ ) \ ) diff --git a/algorithms/unit_tests/TestSortByKey.hpp b/algorithms/unit_tests/TestSortByKey.hpp new file mode 100644 index 00000000000..16f68eaaf26 --- /dev/null +++ b/algorithms/unit_tests/TestSortByKey.hpp @@ -0,0 +1,241 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TEST_SORT_BY_KEY_HPP + +#include +#include +#include +#include + +#include // pair + +namespace Test { +namespace SortImpl { + +struct Less { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs < rhs; + } +}; + +struct Greater { + template + KOKKOS_INLINE_FUNCTION bool operator()(const ValueType &lhs, + const ValueType &rhs) const { + return lhs > rhs; + } +}; + +template +struct is_sorted_by_key_struct { + Keys keys; + Keys keys_orig; + Permute permute; + Comparator comparator; + + is_sorted_by_key_struct(Keys keys_, Keys keys_orig_, Permute permute_, + Comparator comparator_ = Comparator{}) + : keys(keys_), + keys_orig(keys_orig_), + permute(permute_), + comparator(comparator_) {} + KOKKOS_INLINE_FUNCTION + void operator()(int i, unsigned int &count) const { + if (i < keys.extent_int(0) - 1 && comparator(keys(i + 1), keys(i))) ++count; + if (keys(i) != keys_orig(permute(i))) ++count; + } +}; + +template +void iota(ExecutionSpace const &space, ViewType const &v, + typename ViewType::value_type value = 0) { + using ValueType = typename ViewType::value_type; + Kokkos::parallel_for( + "ArborX::Algorithms::iota", + Kokkos::RangePolicy(space, 0, v.extent(0)), + KOKKOS_LAMBDA(int i) { v(i) = value + (ValueType)i; }); +} + +} // namespace SortImpl + +TEST(TEST_CATEGORY, SortByKeyEmptyView) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 0); + Kokkos::View values("values", 0); + + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); +} + +TEST(TEST_CATEGORY, SortByKey) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + for (auto keys_vector : {std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + Kokkos::Experimental::sort_by_key(space, keys, permute); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct(keys, keys_orig, + permute), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyWithComparator) { + using ExecutionSpace = TEST_EXECSPACE; + using MemorySpace = typename ExecutionSpace::memory_space; + + ExecutionSpace space{}; + + SortImpl::Greater comparator; + + for (auto keys_vector : {std::vector{36, 19, 25, 17, 3, 7, 1, 2, 9}, + std::vector{36, 19, 25, 17, 3, 9, 1, 2, 7}, + std::vector{100, 19, 36, 17, 3, 25, 1, 2, 7}, + std::vector{15, 5, 11, 3, 4, 8}}) { + auto const n = keys_vector.size(); + + auto keys = Kokkos::create_mirror_view_and_copy( + MemorySpace{}, + Kokkos::View( + keys_vector.data(), n)); + + auto keys_orig = Kokkos::create_mirror(space, keys); + Kokkos::deep_copy(space, keys_orig, keys); + + Kokkos::View permute("permute", n); + SortImpl::iota(space, permute); + + 
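// A gloss on the verification scheme used in these sort_by_key tests:
// sort_by_key reorders `keys` and applies the same permutation to the
// iota-filled `permute` view, so is_sorted_by_key_struct can check both
// sortedness and that keys(i) == keys_orig(permute(i)). A host-only analogue
// of that contract, using only std algorithms (all names below are local to
// this sketch):

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> keys{36, 19, 25, 17, 3, 7, 1, 2, 9};
  const std::vector<int> keys_orig = keys;

  // identity permutation, reordered alongside the keys by "sort by key"
  std::vector<int> permute(keys.size());
  std::iota(permute.begin(), permute.end(), 0);
  std::sort(permute.begin(), permute.end(),
            [&](int a, int b) { return keys_orig[a] < keys_orig[b]; });
  for (std::size_t i = 0; i < keys.size(); ++i) keys[i] = keys_orig[permute[i]];

  // the two properties whose violations the test's reduction counts
  assert(std::is_sorted(keys.begin(), keys.end()));
  for (std::size_t i = 0; i < keys.size(); ++i)
    assert(keys[i] == keys_orig[permute[i]]);
}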
Kokkos::Experimental::sort_by_key(space, keys, permute, comparator); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys, keys_orig, permute, comparator), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); + } +} + +TEST(TEST_CATEGORY, SortByKeyStaticExtents) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + Kokkos::View keys("keys"); + + Kokkos::View values_static("values_static"); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_static)); + + Kokkos::View values_dynamic("values_dynamic", 10); + ASSERT_NO_THROW( + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); +} + +template +void buildViewsForStrided(ExecutionSpace const &space, int n, Keys &keys, + Values &values) { + Kokkos::parallel_for( + "create_data", + Kokkos::MDRangePolicy, ExecutionSpace>(space, {0, 0, 0}, + {n, n, n}), + KOKKOS_LAMBDA(int i, int j, int k) { + keys(i, j, k) = n - i; + values(i, j, k) = j; + }); +} + +TEST(TEST_CATEGORY, SortByKeyWithStrides) { + using ExecutionSpace = TEST_EXECSPACE; + + ExecutionSpace space{}; + + auto const n = 10; + + Kokkos::View keys("keys", n, n, n); + Kokkos::View values("values", n, n, n); + buildViewsForStrided(space, n, keys, values); + + auto keys_sub = Kokkos::subview(keys, Kokkos::ALL(), 1, 2); + auto values_sub = Kokkos::subview(values, 4, Kokkos::ALL(), 6); + + auto keys_orig = Kokkos::create_mirror(space, keys_sub); + Kokkos::deep_copy(space, keys_orig, keys_sub); + + Kokkos::Experimental::sort_by_key(space, keys_sub, values_sub); + + unsigned int sort_fails = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(space, 0, n), + SortImpl::is_sorted_by_key_struct( + keys_sub, keys_orig, values_sub), + sort_fails); + + ASSERT_EQ(sort_fails, 0u); +} + +TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { + using ExecutionSpace = TEST_EXECSPACE; + + // does not matter if we use int or something else + Kokkos::View keys("keys", 3); + Kokkos::View values("values", 1); + + ASSERT_DEATH( + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values), + "values and keys extents must be the same"); + ASSERT_DEATH(Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values, + SortImpl::Greater{}), + "values and keys extents must be the same"); +} + +} // namespace Test +#endif diff --git a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 3eb963faf2d..67052e2f9d4 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -239,16 +239,8 @@ KOKKOS_FUNCTION bool team_members_have_matching_result( // set accum to 1 if a mismach is found const bool mismatch = memberValue != target; int accum = static_cast(mismatch); - // FIXME_OPENMPTARGET: team API does not meet the TeamHandle concept and - // ignores the reducer passed -#if defined KOKKOS_ENABLE_OPENMPTARGET - Kokkos::Sum dummyReducer(accum); - const auto result = teamHandle.team_reduce(accum, dummyReducer); - return (result == 0); -#else teamHandle.team_reduce(Kokkos::Sum(accum)); return (accum == 0); -#endif } template diff --git a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index 6ab68a1987d..b364c53a888 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test 
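// A side note on the SortByKeyWithStrides test above: rank-1 slices of a
// rank-3 view are generally *strided*, so that test exercises sort_by_key on
// non-contiguous data. Minimal sketch (assumes an installed Kokkos; names are
// local to this sketch):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);
  Kokkos::View<int***> a("a", 10, 10, 10);
  // the second slice strides through memory whatever the layout of `a` is,
  // so sort_by_key cannot assume unit-stride input
  auto s0 = Kokkos::subview(a, Kokkos::ALL(), 1, 2);
  auto s1 = Kokkos::subview(a, 4, Kokkos::ALL(), 6);
  (void)s0;
  (void)s1;
}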
{ namespace stdalgos { @@ -132,47 +133,6 @@ void my_host_exclusive_scan(it1 first, it1 last, it2 dest, ValType init, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - ValueType init_value, BinaryOp bop) { - //! always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), init_value, bop); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - if (test_view_h.extent(0) > 0) { - for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -189,107 +149,153 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value, BinaryOp bop) { + //! always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_exclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), init_value, bop); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + if (test_view_h.extent(0) > 0) { + for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + ValueType init_value) { + (*this)(data_view, test_view, init_value, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info, - ValueType init_value) { - using default_op = SumFunctor; +template +void run_single_scenario(const InfoType& scenario_info, ValueType init_value, + OpOrEmpty... 
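// For reference, the gold sequence that my_host_exclusive_scan/VerifyData
// compare against follows the usual exclusive-scan semantics: element i is
// init combined with the inputs before position i. Host-only illustration,
// independent of Kokkos:

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> in{1, 2, 3, 4};
  std::vector<int> out(in.size());
  std::exclusive_scan(in.begin(), in.end(), out.begin(), 10);  // init = 10
  assert(out == (std::vector<int>{10, 11, 13, 16}));
}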
empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm every time to + // ensure the algorithm does something meaningful { fill_zero(view_dest); auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value); + auto r = KE::exclusive_scan(exespace(), view_from, view_dest, init_value, + empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } { fill_zero(view_dest); auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value); + init_value, empty_or_op...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, default_op()); + VerifyData()(view_from, view_dest, init_value, empty_or_op...); } Kokkos::fence(); } -template -void run_single_scenario_custom_op(const InfoType& scenario_info, - ValueType init_value, BinaryOp bop) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, + OpOrEmpty...
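// One note before the body of run_single_scenario_inplace below: the in-place
// scenarios pass the same view as both input and destination, and that
// aliasing is well-defined for the std scans too, which is the behavior being
// mirrored. Host-only sketch:

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3};
  std::inclusive_scan(v.begin(), v.end(), v.begin());  // dest aliases input
  assert(v == (std::vector<int>{1, 3, 6}));
}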
empty_or_op) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - auto view_dest = create_view(Tag{}, view_ext, "exclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "exclusive_scan"); - fill_view(view_from, name); + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op. Therefore, after the op is done, view2 should contain the + // result of doing an exclusive scan. NOTE: view2 is filled below every time + // because the algorithm acts in place + + auto view1 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view1"); + fill_view(view1, name); + auto view2 = + create_view(Tag{}, view_ext, "exclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::exclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), init_value, empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = - KE::exclusive_scan(exespace(), view_from, view_dest, init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan(exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } { - fill_zero(view_dest); - auto r = KE::exclusive_scan("label", exespace(), view_from, view_dest, - init_value, bop); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, init_value, bop); + fill_view(view2, name); + auto r = KE::exclusive_scan("label", exespace(), view2, view2, init_value, + empty_or_op...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, init_value, empty_or_op...); } Kokkos::fence(); @@ -303,34 +309,39 @@ void run_exclusive_scan_all_scenarios() { {"medium", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it, ValueType{0}); - run_single_scenario_default_op(it, ValueType{1}); - run_single_scenario_default_op(it, ValueType{-2}); - run_single_scenario_default_op(it, ValueType{3}); + run_single_scenario(it, ValueType{0}); + run_single_scenario(it, ValueType{1}); + run_single_scenario(it, ValueType{-2}); + run_single_scenario(it, ValueType{3}); + + run_single_scenario_inplace(it, ValueType{0}); + run_single_scenario_inplace(it, ValueType{-2}); #if
!defined KOKKOS_ENABLE_OPENMPTARGET // custom multiply op is only run for small views otherwise it overflows if (it.first == "small-a" || it.first == "small-b") { using custom_bop_t = MultiplyFunctor; - run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, - custom_bop_t()); - } + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); - using custom_bop_t = SumFunctor; - run_single_scenario_custom_op(it, ValueType{0}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{1}, + run_single_scenario_inplace(it, ValueType{0}, custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{-2}, - custom_bop_t()); - run_single_scenario_custom_op(it, ValueType{3}, + run_single_scenario_inplace(it, ValueType{-2}, custom_bop_t()); + } + + using custom_bop_t = SumFunctor; + run_single_scenario(it, ValueType{0}, custom_bop_t()); + run_single_scenario(it, ValueType{1}, custom_bop_t()); + run_single_scenario(it, ValueType{-2}, custom_bop_t()); + run_single_scenario(it, ValueType{3}, custom_bop_t()); + + run_single_scenario_inplace(it, ValueType{0}, + custom_bop_t()); + run_single_scenario_inplace(it, ValueType{-2}, + custom_bop_t()); #endif } } diff --git a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index 8e60a43e5ff..a08a7372108 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -143,51 +144,6 @@ void my_host_inclusive_scan(it1 first, it1 last, it2 dest, BinOp bop, } } -template -void verify_data(ViewType1 data_view, // contains data - ViewType2 test_view, // the view to test - BinaryOp bop, Args... args /* copy on purpose */) { - //! 
always careful because views might not be deep copyable - - auto data_view_dc = create_deep_copyable_compatible_clone(data_view); - auto data_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); - - using gold_view_value_type = typename ViewType2::value_type; - Kokkos::View gold_h( - "goldh", data_view.extent(0)); - my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), - KE::begin(gold_h), bop, args...); - - auto test_view_dc = create_deep_copyable_compatible_clone(test_view); - auto test_view_h = - create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); - - const auto ext = test_view_h.extent(0); - if (ext > 0) { - for (std::size_t i = 0; i < ext; ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - - if (std::is_same::value) { - ASSERT_EQ(gold_h(i), test_view_h(i)); - } else { - const auto error = - std::abs(static_cast(gold_h(i) - test_view_h(i))); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(static_cast(gold_h(i) - test_view_h(i))) - << std::endl; - } - EXPECT_LT(error, 1e-10); - } - } - // std::cout << " last el: " << test_view_h(ext-1) << std::endl; - } -} - template struct MultiplyFunctor { KOKKOS_INLINE_FUNCTION @@ -204,107 +160,151 @@ struct SumFunctor { } }; +struct VerifyData { + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view, // the view to test + BinaryOp bop, Args... args /* copy on purpose */) { + //! always careful because views might not be deep copyable + + auto data_view_dc = create_deep_copyable_compatible_clone(data_view); + auto data_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), data_view_dc); + + using gold_view_value_type = typename ViewType2::value_type; + Kokkos::View gold_h( + "goldh", data_view.extent(0)); + my_host_inclusive_scan(KE::cbegin(data_view_h), KE::cend(data_view_h), + KE::begin(gold_h), bop, args...); + + auto test_view_dc = create_deep_copyable_compatible_clone(test_view); + auto test_view_h = + create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); + + const auto ext = test_view_h.extent(0); + if (ext > 0) { + for (std::size_t i = 0; i < ext; ++i) { + if (std::is_same::value) { + ASSERT_EQ(gold_h(i), test_view_h(i)); + } else { + const auto error = + std::abs(static_cast(gold_h(i) - test_view_h(i))); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); + } + } + } + } + + template + void operator()(ViewType1 data_view, // contains data + ViewType2 test_view) // the view to test + { + using value_type = typename ViewType1::non_const_value_type; + (*this)(data_view, test_view, SumFunctor()); + } +}; + std::string value_type_to_string(int) { return "int"; } std::string value_type_to_string(double) { return "double"; } -template -void run_single_scenario_default_op(const InfoType& scenario_info) { - using default_op = SumFunctor; +template +void run_single_scenario(const InfoType& scenario_info, + Args... 
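// As in the exclusive-scan test, the inclusive gold values computed by
// my_host_inclusive_scan follow the std semantics; note that an optional init
// value folds into every element of an inclusive scan. Host-only
// illustration, independent of Kokkos:

#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> in{1, 2, 3, 4};
  std::vector<int> out(in.size());
  std::inclusive_scan(in.begin(), in.end(), out.begin(), std::plus<>{}, 10);
  assert(out == (std::vector<int>{11, 13, 16, 20}));
}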
args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "inclusive_scan default op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << std::endl; auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); fill_view(view_from, name); + // view_dest is filled with zeros before calling the algorithm every time to + // ensure the algorithm does something meaningful { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan(exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest)); + auto r = + KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), + KE::cend(view_from), KE::begin(view_dest), args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest); + auto r = KE::inclusive_scan(exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } { fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest); + auto r = + KE::inclusive_scan("label", exespace(), view_from, view_dest, args...); ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, default_op()); + VerifyData()(view_from, view_dest, args...); } Kokkos::fence(); } -template -void run_single_scenario_custom_op(const InfoType& scenario_info, BinaryOp bop, - Args... args /* copy on purpose */) { +template +void run_single_scenario_inplace(const InfoType& scenario_info, + Args...
args /* copy on purpose */) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // if (1 == sizeof...(Args)) { - // std::cout << "inclusive_scan custom op and init value: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } else { - // std::cout << "inclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " << std::endl; - // } + // since here we call the in-place operation, we need to use two views: + // view1: filled according to what the scenario asks for and is not modified + // view2: filled according to what the scenario asks for and used for the + // in-place op. Therefore, after the op is done, view2 should contain the + // result of doing an inclusive scan. NOTE: view2 is filled below every time + // because the algorithm acts in place - auto view_dest = create_view(Tag{}, view_ext, "inclusive_scan"); - auto view_from = create_view(Tag{}, view_ext, "inclusive_scan"); - fill_view(view_from, name); + auto view1 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view1"); + fill_view(view1, name); + + auto view2 = + create_view(Tag{}, view_ext, "inclusive_scan_inplace_view2"); { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), KE::cbegin(view2), KE::cend(view2), + KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), KE::cbegin(view2), + KE::cend(view2), KE::begin(view2), args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan(exespace(), view_from, view_dest, bop, args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan(exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } { - fill_zero(view_dest); - auto r = KE::inclusive_scan("label", exespace(), view_from, view_dest, bop, - args...); - ASSERT_EQ(r, KE::end(view_dest)); - verify_data(view_from, view_dest, bop, args...); + fill_view(view2, name); + auto r = KE::inclusive_scan("label", exespace(), view2, view2, args...); + ASSERT_EQ(r, KE::end(view2)); + VerifyData()(view1, view2, args...); } Kokkos::fence(); @@ -318,27 +318,35 @@ void run_inclusive_scan_all_scenarios() { {"medium-a", 313}, {"medium-b", 1103}, {"large", 10513}}; for (const auto& it : scenarios) { - run_single_scenario_default_op(it); + run_single_scenario(it); + run_single_scenario_inplace(it); #if !defined KOKKOS_ENABLE_OPENMPTARGET // the sum custom op is always run using sum_binary_op = SumFunctor; sum_binary_op sbop; - run_single_scenario_custom_op(it, sbop); - run_single_scenario_custom_op(it, sbop, ValueType{0}); - run_single_scenario_custom_op(it, sbop, ValueType{1}); - run_single_scenario_custom_op(it,
sbop, ValueType{-2}); - run_single_scenario_custom_op(it, sbop, ValueType{3}); + run_single_scenario(it, sbop); + run_single_scenario(it, sbop, ValueType{0}); + run_single_scenario(it, sbop, ValueType{1}); + run_single_scenario(it, sbop, ValueType{-2}); + run_single_scenario(it, sbop, ValueType{3}); + + run_single_scenario_inplace(it, sbop, ValueType{0}); + run_single_scenario_inplace(it, sbop, ValueType{-2}); // custom multiply only for small views to avoid overflows if (it.first == "small-a" || it.first == "small-b") { using mult_binary_op = MultiplyFunctor; mult_binary_op mbop; - run_single_scenario_custom_op(it, mbop); - run_single_scenario_custom_op(it, mbop, ValueType{0}); - run_single_scenario_custom_op(it, mbop, ValueType{1}); - run_single_scenario_custom_op(it, mbop, ValueType{-2}); - run_single_scenario_custom_op(it, mbop, ValueType{3}); + run_single_scenario(it, mbop); + run_single_scenario(it, mbop, ValueType{0}); + run_single_scenario(it, mbop, ValueType{1}); + run_single_scenario(it, mbop, ValueType{-2}); + run_single_scenario(it, mbop, ValueType{3}); + + run_single_scenario_inplace(it, mbop); + run_single_scenario_inplace(it, mbop, ValueType{0}); + run_single_scenario_inplace(it, mbop, ValueType{-2}); } #endif } diff --git a/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index f31d49e06b4..75d4f0afebc 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -146,7 +146,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsA[3] = KE::is_sorted("label", exespace(), view); const auto allA = std::all_of(resultsA.cbegin(), resultsA.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allA); + EXPECT_TRUE(allA) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ -159,7 +159,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), [=](bool v) { return v == gold; }); - EXPECT_TRUE(allB); + EXPECT_TRUE(allB) << name << ", " << view_tag_to_string(Tag{}); #endif Kokkos::fence(); @@ -173,9 +173,6 @@ void run_is_sorted_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index dcfe8ad67e1..29ac7cc9bc1 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -145,10 +145,10 @@ void run_single_scenario(const InfoType& scenario_info) { KE::is_sorted_until("label", exespace(), KE::begin(view), KE::end(view)); auto r3 = KE::is_sorted_until(exespace(), view); auto r4 = KE::is_sorted_until("label", exespace(), view); - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); #if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; @@ 
-160,10 +160,10 @@ void run_single_scenario(const InfoType& scenario_info) { auto r8 = KE::is_sorted_until("label", exespace(), view, comp); #endif - ASSERT_EQ(r1, gold); - ASSERT_EQ(r2, gold); - ASSERT_EQ(r3, gold); - ASSERT_EQ(r4, gold); + ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); + ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); Kokkos::fence(); } @@ -176,9 +176,6 @@ void run_is_sorted_until_all_scenarios() { {"medium-a", 1003}, {"medium-b", 1003}, {"large-a", 101513}, {"large-b", 101513}}; - std::cout << "is_sorted_until: " << view_tag_to_string(Tag{}) - << ", all overloads \n"; - for (const auto& it : scenarios) { run_single_scenario(it); } diff --git a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 4604764097e..1b1a02f39c4 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); // move constr MyMovableType b(std::move(a)); @@ -70,7 +70,7 @@ struct StdAlgoModSeqOpsTestMove { void operator()(const int index) const { typename ViewType::value_type a{11}; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value, ""); + static_assert(std::is_rvalue_reference::value); m_view(index) = std::move(a); } @@ -89,50 +89,6 @@ TEST(std_algorithms_mod_ops_test, move_within_parfor) { } } -// ------------ -// swap -// ------------ -TEST(std_algorithms_mod_ops_test, swap) { - { - int a = 1; - int b = 2; - KE::swap(a, b); - ASSERT_EQ(a, 2); - ASSERT_EQ(b, 1); - } - - { - double a = 3.; - double b = 1.; - KE::swap(a, b); - EXPECT_DOUBLE_EQ(a, 1.); - EXPECT_DOUBLE_EQ(b, 3.); - } -} - -template -struct StdAlgoModSeqOpsTestSwap { - ViewType m_view; - - KOKKOS_INLINE_FUNCTION - void operator()(const int index) const { - typename ViewType::value_type newval{11}; - KE::swap(m_view(index), newval); - } - - StdAlgoModSeqOpsTestSwap(ViewType aIn) : m_view(aIn) {} -}; - -TEST(std_algorithms_mod_ops_test, swap_within_parfor) { - auto a = create_view(stdalgos::DynamicTag{}, 10, "a"); - StdAlgoModSeqOpsTestSwap fnc(a); - Kokkos::parallel_for(a.extent(0), fnc); - auto a_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a); - for (std::size_t i = 0; i < a.extent(0); ++i) { - EXPECT_DOUBLE_EQ(a_h(0), 11.); - } -} - // ------------ // iter_swap // ------------ diff --git a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index f169fd9ce88..a36c9db2b9e 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -110,11 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); const std::size_t ext = view_from.extent(0); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index 
b5aa27c7c38..7c3c465dc8d 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -166,6 +166,10 @@ void run_all_scenarios() { } TEST(std_algorithms_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index c6b2566c6cf..2c8fee02f47 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -121,7 +121,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -147,9 +149,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -168,12 +167,19 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -223,11 +229,16 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { @@ -236,16 +247,24 @@ void run_all_scenarios() { #else for (int apiId : {0, 1}) { #endif - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } } TEST(std_algorithms_exclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel 
GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamExclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp index 0daf9dbfe82..b5f4cdd6123 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp @@ -139,7 +139,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -165,9 +167,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -186,12 +185,20 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -251,25 +258,38 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef inclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1, 2, 3, 4, 5}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } } TEST(std_algorithms_inclusive_scan_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamInclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 
24b840154b7..6bb0d249988 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -212,6 +212,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index ce18eb4d319..cff9aa178a2 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -168,6 +168,10 @@ void run_all_scenarios() { } TEST(std_algorithms_remove_copy_if_team_test, test) { +// FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp index 9f30812d8ef..60fa369af18 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp @@ -108,7 +108,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -134,9 +136,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // tranform_exclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -156,12 +155,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -200,16 +208,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { #undef transform_exclusive_scan } - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto 
dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int apiId : {0, 1}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -219,6 +232,10 @@ TEST(std_algorithms_transform_exclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformExclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 4b316602326..10454d65515 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -131,7 +131,9 @@ struct TestFunctorA { } }; -template +struct InPlace {}; + +template void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { /* description: use a rank-2 view randomly filled with values, @@ -157,9 +159,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { using space_t = Kokkos::DefaultExecutionSpace; Kokkos::TeamPolicy policy(numTeams, Kokkos::AUTO()); - // create the destination view - Kokkos::View destView("destView", numTeams, numCols); - // tranform_inclusive_scan returns an iterator so to verify that it is correct // each team stores the distance of the returned iterator from the beginning // of the interval that team operates on and then we check that these @@ -179,12 +178,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { rand_pool pool(lowerBound * upperBound); Kokkos::fill_random(initValuesView_h, pool, lowerBound, upperBound); - // use CTAD for functor auto initValuesView = Kokkos::create_mirror_view_and_copy(space_t(), initValuesView_h); - TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, - initValuesView, binaryOp, unaryOp, apiId); - Kokkos::parallel_for(policy, fnc); + + // create the destination view + Kokkos::View destView("destView", numTeams, numCols); + if constexpr (std::is_same_v) { + TestFunctorA fnc(sourceView, sourceView, distancesView, + intraTeamSentinelView, initValuesView, binaryOp, unaryOp, + apiId); + Kokkos::parallel_for(policy, fnc); + } else { + TestFunctorA fnc(sourceView, destView, distancesView, intraTeamSentinelView, + initValuesView, binaryOp, unaryOp, apiId); + Kokkos::parallel_for(policy, fnc); + } // ----------------------------------------------- // run cpp-std kernel and check @@ -236,16 +244,21 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { } #undef transform_inclusive_scan - auto dataViewAfterOp_h = create_host_space_copy(destView); - expect_equal_host_views(stdDestView, dataViewAfterOp_h); + if constexpr (std::is_same_v) { + auto dataViewAfterOp_h = create_host_space_copy(sourceView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } else { + auto dataViewAfterOp_h = create_host_space_copy(destView); + expect_equal_host_views(stdDestView, dataViewAfterOp_h); + } } -template +template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { for (int 
apiId : {0, 1, 2, 3}) { - test_A(numTeams, numCols, apiId); + test_A(numTeams, numCols, apiId); } } } @@ -255,6 +268,10 @@ TEST(std_algorithms_transform_inclusive_scan_team_test, test) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); + + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } } // namespace TeamTransformInclusiveScan diff --git a/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 87687b60a16..0d3289e196f 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -186,6 +186,10 @@ void run_all_scenarios() { } TEST(std_algorithms_unique_copy_team_test, test) { + // FIXME_OPENMPTARGET +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + GTEST_SKIP() << "the test is known to fail with OpenMPTarget on Intel GPUs"; +#endif run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index 9dac3ce75ff..fa2804256ac 100644 --- a/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -16,6 +16,7 @@ #include #include +#include namespace Test { namespace stdalgos { @@ -160,24 +161,15 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " " - // << gold_h(i) << " " << test_view_h(i) << " " - // << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - if (std::is_same::value) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); - if (error > 1e-10) { - std::cout << i << " " << std::setprecision(15) << data_view_h(i) - << " " << gold_h(i) << " " << test_view_h(i) << " " - << std::abs(gold_h(i) - test_view_h(i)) << std::endl; - } - EXPECT_LT(error, 1e-10); + ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error + << static_cast(test_view_h(i)) << " " + << static_cast(gold_h(i)); } } - // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) << - // std::endl; } } @@ -205,17 +197,13 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, BinaryOp bop, UnaryOp uop) { const auto name = std::get<0>(scenario_info); const std::size_t view_ext = std::get<1>(scenario_info); - // std::cout << "transform_exclusive_scan custom op: " << name << ", " - // << view_tag_to_string(Tag{}) << ", " - // << value_type_to_string(ValueType()) << ", " - // << "init = " << init_value << std::endl; - - auto view_dest = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); - auto view_from = - create_view(Tag{}, view_ext, "transform_exclusive_scan"); + + auto view_from = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_from"); fill_view(view_from, name); + auto view_dest = create_view(Tag{}, view_ext, + "transform_exclusive_scan_view_dest"); { fill_zero(view_dest); auto r = KE::transform_exclusive_scan( @@ -253,6 +241,65 @@ void run_single_scenario(const InfoType& scenario_info, ValueType init_value, Kokkos::fence(); } +template +void run_single_scenario_inplace(const InfoType& scenario_info, + ValueType init_value, BinaryOp bop, + 
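// For the transform variants tested below, the gold semantics are: apply the
// unary op to each input element, then exclusive-scan the transformed values
// with the binary op and the init value. The std equivalent, host-only:

#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> in{1, 2, 3};
  std::vector<int> out(in.size());
  auto square = [](int x) { return x * x; };
  // transformed inputs are {1, 4, 9}; exclusive scan with init 0 -> {0, 1, 5}
  std::transform_exclusive_scan(in.begin(), in.end(), out.begin(), 0,
                                std::plus<>{}, square);
  assert(out == (std::vector<int>{0, 1, 5}));
}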
UnaryOp uop) {
+  const auto name            = std::get<0>(scenario_info);
+  const std::size_t view_ext = std::get<1>(scenario_info);
+
+  // since here we call the in-place operation, we need to use two views:
+  // view1: filled according to what the scenario asks for and is not modified
+  // view2: filled according to what the scenario asks for and used for the
+  // in-place op. Therefore, after the op is done, view2 should contain the
+  // result of doing the exclusive scan.
+  // NOTE: view2 is refilled below every time because the algorithm acts in place
+
+  auto view1 =
+      create_view(Tag{}, view_ext, "transform_exclusive_scan_view1");
+  fill_view(view1, name);
+
+  auto view2 =
+      create_view(Tag{}, view_ext, "transform_exclusive_scan_view2");
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan(exespace(), KE::cbegin(view2),
+                                          KE::cend(view2), KE::begin(view2),
+                                          init_value, bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan(
+        "label", exespace(), KE::cbegin(view2), KE::cend(view2),
+        KE::begin(view2), init_value, bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan(exespace(), view2, view2, init_value,
+                                          bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  {
+    fill_view(view2, name);
+    auto r = KE::transform_exclusive_scan("label", exespace(), view2, view2,
+                                          init_value, bop, uop);
+    ASSERT_EQ(r, KE::end(view2));
+    verify_data(view1, view2, init_value, bop, uop);
+  }
+
+  Kokkos::fence();
+}
+
 template
 void run_all_scenarios() {
   const std::map scenarios = {
@@ -267,6 +314,11 @@ void run_all_scenarios() {
     run_single_scenario(it, ValueType{1}, bop_t(), uop_t());
     run_single_scenario(it, ValueType{-2}, bop_t(), uop_t());
     run_single_scenario(it, ValueType{3}, bop_t(), uop_t());
+
+    run_single_scenario_inplace(it, ValueType{0}, bop_t(),
+                                uop_t());
+    run_single_scenario_inplace(it, ValueType{-2}, bop_t(),
+                                uop_t());
   }
 }
diff --git a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
index a90a68ca1d7..fb81ae91b04 100644
--- a/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
+++ b/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp
@@ -16,6 +16,7 @@
 #include
 #include
+#include

 namespace Test {
 namespace stdalgos {
@@ -172,24 +173,15 @@ void verify_data(ViewType1 data_view,  // contains data
       create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc);
   if (test_view_h.extent(0) > 0) {
     for (std::size_t i = 0; i < test_view_h.extent(0); ++i) {
-      // std::cout << i << " " << std::setprecision(15) << data_view_h(i) << " "
-      //           << gold_h(i) << " " << test_view_h(i) << " "
-      //           << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-
       if (std::is_same::value) {
         ASSERT_EQ(gold_h(i), test_view_h(i));
       } else {
         const auto error = std::abs(gold_h(i) - test_view_h(i));
-        if (error > 1e-10) {
-          std::cout << i << " " << std::setprecision(15) << data_view_h(i)
-                    << " " << gold_h(i) << " " << test_view_h(i) << " "
-                    << std::abs(gold_h(i) - test_view_h(i)) << std::endl;
-        }
-        EXPECT_LT(error, 1e-10);
+        ASSERT_LT(error, 1e-10) << i << " " << std::setprecision(15) << error
+                                << static_cast(test_view_h(i)) << " "
+                                << static_cast(gold_h(i));
       }
     }
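For reference, a minimal, self-contained sketch of the in-place call pattern the new run_single_scenario_inplace cases exercise; TimesTwo and Plus are hypothetical stand-ins for the test's uop/bop, and the argument order mirrors the calls in the diff above:

#include <Kokkos_Core.hpp>
#include <Kokkos_StdAlgorithms.hpp>

namespace KE = Kokkos::Experimental;

struct TimesTwo {
  KOKKOS_INLINE_FUNCTION int operator()(int x) const { return 2 * x; }
};
struct Plus {
  KOKKOS_INLINE_FUNCTION int operator()(int a, int b) const { return a + b; }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<int*> v("v", 5);
    Kokkos::deep_copy(v, 2);
    // In place: v is both source and destination, so the input is consumed
    // and must be refilled before running another overload on it.
    KE::transform_exclusive_scan(Kokkos::DefaultExecutionSpace(),
                                 KE::cbegin(v), KE::cend(v), KE::begin(v),
                                 /*init_value=*/0, Plus{}, TimesTwo{});
    // v now holds {0, 4, 8, 12, 16}.
  }
  Kokkos::finalize();
}

-    // std::cout << " last el: " << test_view_h(test_view_h.extent(0)-1) <<
-    // 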
std::endl; } }
@@ -210,30 +202,11 @@ struct SumBinaryFunctor {
 std::string value_type_to_string(int) { return "int"; }
 std::string value_type_to_string(double) { return "double"; }

-template
-void print_scenario_details(const std::string& name, BopT bop, UopT uop) {
-  (void)bop;
-  (void)uop;
-  std::cout << "transform_inclusive_scan: " << name << ", "
-            << view_tag_to_string(Tag{}) << std::endl;
-}
-
-template
-void print_scenario_details(const std::string& name, BopT bop, UopT uop,
-                            ValueType init_value) {
-  (void)bop;
-  (void)uop;
-  std::cout << "transform_inclusive_scan: " << name << ", "
-            << view_tag_to_string(Tag{}) << ", "
-            << "init = " << init_value << std::endl;
-}
-
 template
 void run_single_scenario(const InfoType& scenario_info,
                          Args... args /* by value on purpose*/) {
   const auto name            = std::get<0>(scenario_info);
   const std::size_t view_ext = std::get<1>(scenario_info);
-  // print_scenario_details(name, args...);

   auto view_dest =
       create_view(Tag{}, view_ext, "transform_inclusive_scan");
@@ -278,6 +251,63 @@ void run_single_scenario(const InfoType& scenario_info,
   Kokkos::fence();
 }

+template
+void run_single_scenario_inplace(const InfoType& scenario_info,
+                                 Args... args /* by value on purpose*/) {
+  const auto name            = std::get<0>(scenario_info);
+  const std::size_t view_ext = std::get<1>(scenario_info);
+
+  // since here we call the in-place operation, we need to use two views:
+  // view_1: filled according to the scenario and is not modified
+  // view_2: filled according to the scenario and used for the in-place op
+  // Therefore, after the op is done, view_2 should contain the
+  // result of doing the inclusive scan.
+  // NOTE: view_2 must be refilled before every call to the algorithm
+  // because the algorithm acts in place
+
+  auto view_1 = create_view(Tag{}, view_ext,
+                            "transform_inclusive_scan_view_1");
+  fill_view(view_1, name);
+
+  auto view_2 = create_view(Tag{}, view_ext,
+                            "transform_inclusive_scan_view_2");
+
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan(exespace(), KE::cbegin(view_2),
+                                          KE::cend(view_2), KE::begin(view_2),
+                                          args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan("label", exespace(),
+                                          KE::cbegin(view_2), KE::cend(view_2),
+                                          KE::begin(view_2), args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan(exespace(), view_2, view_2, args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  {
+    fill_view(view_2, name);
+    auto r = KE::transform_inclusive_scan("label", exespace(), view_2, view_2,
+                                          args...);
+    ASSERT_EQ(r, KE::end(view_2));
+    verify_data(view_1, view_2, args...);
+  }
+
+  Kokkos::fence();
+}
+
 template
 void run_all_scenarios() {
   const std::map scenarios = {
@@ -294,15 +324,23 @@ void run_all_scenarios() {
     run_single_scenario(it, bop_t(), uop_t(), ValueType{2});
     run_single_scenario(it, bop_t(), uop_t(), ValueType{-1});
     run_single_scenario(it, bop_t(), uop_t(), ValueType{-2});
+
+    run_single_scenario_inplace(it, bop_t(), uop_t());
+    run_single_scenario_inplace(it, bop_t(), uop_t(),
+                                ValueType{0});
+    run_single_scenario_inplace(it, bop_t(), uop_t(),
+                                ValueType{2});
+    run_single_scenario_inplace(it, bop_t(), uop_t(),
+                                ValueType{-2});
   }
 }

 #if !defined KOKKOS_ENABLE_OPENMPTARGET
 TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) {
   run_all_scenarios();
-  // run_all_scenarios();
-  // 
run_all_scenarios(); - // run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); + run_all_scenarios(); } #endif diff --git a/algorithms/unit_tests/TestStdReducers.cpp b/algorithms/unit_tests/TestStdReducers.cpp index 3847e1e6a36..c05006a1617 100644 --- a/algorithms/unit_tests/TestStdReducers.cpp +++ b/algorithms/unit_tests/TestStdReducers.cpp @@ -83,9 +83,6 @@ auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = std::conditional_t< (flag == 0), Kokkos::MaxFirstLoc, @@ -132,18 +129,24 @@ TEST(std_algorithms_reducers, max_first_loc) { const auto pair1 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::LeftToRight); - ASSERT_EQ(pair1.first, gold_value); - ASSERT_EQ(pair1.second, gold_location); + ASSERT_EQ(pair1.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); + ASSERT_EQ(pair1.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::LeftToRight); const auto pair2 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::RightToLeft); - ASSERT_EQ(pair2.first, gold_value); - ASSERT_EQ(pair2.second, gold_location); + ASSERT_EQ(pair2.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); + ASSERT_EQ(pair2.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::RightToLeft); const auto pair3 = run_min_or_max_test<0, hostspace, index_type>( view_h, StdReducersTestEnumOrder::Random); - ASSERT_EQ(pair3.first, gold_value); - ASSERT_EQ(pair3.second, gold_location); + ASSERT_EQ(pair3.first, gold_value) + << order_to_string(StdReducersTestEnumOrder::Random); + ASSERT_EQ(pair3.second, gold_location) + << order_to_string(StdReducersTestEnumOrder::Random); } TEST(std_algorithms_reducers, min_first_loc) { @@ -191,9 +194,6 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, static_assert(std::is_same::value, "test is only enabled for HostSpace"); - std::cout << "checking reduction with order: " << order_to_string(enValue) - << "\n"; - using view_value_type = typename ViewType::value_type; using reducer_type = Kokkos::MinMaxFirstLastLoc; @@ -212,10 +212,10 @@ void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, reduction_value_type{view(index), view(index), index, index}); } - ASSERT_EQ(red_result.min_val, gold_values.first); - ASSERT_EQ(red_result.max_val, gold_values.second); - ASSERT_EQ(red_result.min_loc, gold_locs.first); - ASSERT_EQ(red_result.max_loc, gold_locs.second); + ASSERT_EQ(red_result.min_val, gold_values.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_val, gold_values.second) << order_to_string(enValue); + ASSERT_EQ(red_result.min_loc, gold_locs.first) << order_to_string(enValue); + ASSERT_EQ(red_result.max_loc, gold_locs.second) << order_to_string(enValue); } TEST(std_algorithms_reducers, min_max_first_last_loc) { diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 42279bf55db..abf50283594 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1 +1,12 @@ +#FIXME_OPENMPTARGET - compiling in debug mode causes ICE. 
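(The TestStdReducers edits above attach the scan order to each assertion instead of printing it unconditionally. A minimal illustration of GoogleTest's streamed failure messages, with hypothetical values:)

#include <gtest/gtest.h>

TEST(streamed_messages, example) {
  const int gold   = 42;
  const int actual = 42;
  // The streamed text is emitted only when the assertion fails, so passing
  // runs stay quiet; ASSERT_* also aborts the test body on failure, which is
  // why the suite switched from EXPECT_LT to ASSERT_LT.
  ASSERT_EQ(gold, actual) << "order: LeftToRight, index: " << 7;
}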
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic)
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather)
 KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups)
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency)
+KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream)
+
+#FIXME_OPENMPTARGET - These two benchmarks cause ICE. Disabling them for now; a deeper analysis of the cause and a possible fix will follow.
+IF(NOT Kokkos_ENABLE_OPENMPTARGET)
+  KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance)
+  KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops)
+ENDIF()
diff --git a/benchmarks/atomic/CMakeLists.txt b/benchmarks/atomic/CMakeLists.txt
new file mode 100644
index 00000000000..85f7412f492
--- /dev/null
+++ b/benchmarks/atomic/CMakeLists.txt
@@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  atomic
+  SOURCES main.cpp
+)
diff --git a/benchmarks/bytes_and_flops/CMakeLists.txt b/benchmarks/bytes_and_flops/CMakeLists.txt
new file mode 100644
index 00000000000..0ce44a6f1a8
--- /dev/null
+++ b/benchmarks/bytes_and_flops/CMakeLists.txt
@@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  bytes_and_flops
+  SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp
+)
diff --git a/benchmarks/bytes_and_flops/bench.hpp b/benchmarks/bytes_and_flops/bench.hpp
index 2589fd7309b..88830af624b 100644
--- a/benchmarks/bytes_and_flops/bench.hpp
+++ b/benchmarks/bytes_and_flops/bench.hpp
@@ -37,22 +37,22 @@ struct RunStride {
 };
 #define STRIDE 1
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 2
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 4
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 8
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 16
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 #define STRIDE 32
-#include <bench_stride.hpp>
+#include "bench_stride.hpp"
 #undef STRIDE
 template
diff --git a/benchmarks/bytes_and_flops/bench_double.cpp b/benchmarks/bytes_and_flops/bench_double.cpp
index f955c996660..2fda1ae3d42 100644
--- a/benchmarks/bytes_and_flops/bench_double.cpp
+++ b/benchmarks/bytes_and_flops/bench_double.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
-#include <bench.hpp>
+#include "bench.hpp"
 template
 void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S,
                        int B, int I);
diff --git a/benchmarks/bytes_and_flops/bench_float.cpp b/benchmarks/bytes_and_flops/bench_float.cpp
index 137ff67d404..3210116a9ee 100644
--- a/benchmarks/bytes_and_flops/bench_float.cpp
+++ b/benchmarks/bytes_and_flops/bench_float.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
-#include <bench.hpp>
+#include "bench.hpp"
 template
 void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S,
                        int B, int I);
diff --git a/benchmarks/bytes_and_flops/bench_int32_t.cpp b/benchmarks/bytes_and_flops/bench_int32_t.cpp
index 29ccec01414..24a5dcd3899 100644
--- a/benchmarks/bytes_and_flops/bench_int32_t.cpp
+++ b/benchmarks/bytes_and_flops/bench_int32_t.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
-#include <bench.hpp>
+#include "bench.hpp"
 template
 void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S,
                        int B, int I);
diff --git a/benchmarks/bytes_and_flops/bench_int64_t.cpp b/benchmarks/bytes_and_flops/bench_int64_t.cpp
index c153d5eff39..0634700c31e 100644
--- a/benchmarks/bytes_and_flops/bench_int64_t.cpp
+++ b/benchmarks/bytes_and_flops/bench_int64_t.cpp
@@ -14,7 +14,7 @@
 //
 //@HEADER
-#include <bench.hpp>
+#include "bench.hpp"
 template
 void run_stride_unroll(int N, int K, int R, int D, int U, int F, int T, int S,
                        int B, int I);
diff --git a/benchmarks/bytes_and_flops/bench_stride.hpp b/benchmarks/bytes_and_flops/bench_stride.hpp
index b63d486fc9e..80f017fbe8f 100644
--- a/benchmarks/bytes_and_flops/bench_stride.hpp
+++ b/benchmarks/bytes_and_flops/bench_stride.hpp
@@ -15,28 +15,28 @@
 //@HEADER
 #define UNROLL 1
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 2
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 3
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 4
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 5
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 6
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 7
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 #define UNROLL 8
-#include <bench_unroll_stride.hpp>
+#include "bench_unroll_stride.hpp"
 #undef UNROLL
 template
diff --git a/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
index 0f7a298c1bb..78cfd48effe 100644
--- a/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
+++ b/benchmarks/bytes_and_flops/bench_unroll_stride.hpp
@@ -26,7 +26,7 @@ struct Run {
     Kokkos::deep_copy(C, Scalar(3.5));

     Kokkos::Timer timer;
-    for (int i = 0; i < I; ++i) {
+    for (int iter = 0; iter < I; ++iter) {
       Kokkos::parallel_for(
           "BenchmarkKernel",
           Kokkos::TeamPolicy<>(N, T).set_scratch_size(0, Kokkos::PerTeam(S)),
diff --git a/benchmarks/bytes_and_flops/main.cpp b/benchmarks/bytes_and_flops/main.cpp
index 20077757d1f..fdfcc4ea64f 100644
--- a/benchmarks/bytes_and_flops/main.cpp
+++ b/benchmarks/bytes_and_flops/main.cpp
@@ -16,7 +16,7 @@
 #include
 #include
-#include <bench.hpp>
+#include "bench.hpp"
 #include
 extern template void run_stride_unroll(int, int, int, int, int, int, int,
@@ -86,7 +86,7 @@ int main(int argc, char* argv[]) {
     printf("D must be one of 1,2,4,8,16,32\n");
     return 0;
   }
-  if ((P < 1) && (P > 2)) {
+  if ((P < 1) || (P > 4)) {
     printf("P must be one of 1,2,3,4\n");
     return 0;
   }
diff --git a/benchmarks/gather/CMakeLists.txt b/benchmarks/gather/CMakeLists.txt
new file mode 100644
index 00000000000..24c70627725
--- /dev/null
+++ b/benchmarks/gather/CMakeLists.txt
@@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  gather
+  SOURCES main.cpp
+)
diff --git a/benchmarks/gather/gather.hpp b/benchmarks/gather/gather.hpp
index d83461702c7..90b1101c1d5 100644
--- a/benchmarks/gather/gather.hpp
+++ b/benchmarks/gather/gather.hpp
@@ -20,28 +20,28 @@ struct RunGather {
 };
 #define UNROLL 1
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 2
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 3
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 4
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 5
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 6
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 7
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 #define UNROLL 8
-#include <gather_unroll.hpp>
+#include "gather_unroll.hpp"
 #undef UNROLL
 template
diff --git a/benchmarks/gather/gather_unroll.hpp b/benchmarks/gather/gather_unroll.hpp
index 5ee5742a3f7..1aa73091bc5 100644
--- a/benchmarks/gather/gather_unroll.hpp
+++ b/benchmarks/gather/gather_unroll.hpp
@@ -138,7 +138,7 @@ struct RunGather {
     printf(
         "SNKDRUF: %i %i %i %i %i %i %i Time: %lfs Bandwidth: %lfGiB/s GFlop/s: "
         "%lf GGather/s: %lf\n",
-        sizeof(Scalar) / 4, N, K, D, R, UNROLL, F, seconds,
+        static_cast<int>(sizeof(Scalar) / 4), N, K, D, R, UNROLL, F, seconds,
         1.0 * bytes / seconds / 1024 / 1024 / 1024, 1.e-9 * flops / seconds,
         1.e-9 * gather_ops / seconds);
   }
 }
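The headers above rely on a repeated-#include idiom: a body header is included once per UNROLL (or STRIDE) value, stamping out one template specialization per inclusion. A single-file sketch of the equivalent stamping, with the re-included body condensed into a macro so the example compiles on its own (names hypothetical):

template <int U>
struct Run;  // primary template; one explicit specialization per U below

// In bench_stride.hpp/gather.hpp the body lives in a separate header that is
// re-included under a fresh UNROLL each time; a macro plays that role here.
#define STAMP_RUN(U)                                        \
  template <>                                               \
  struct Run<U> {                                           \
    static int unroll() { return U; } /* kernel stand-in */ \
  };

STAMP_RUN(1)
STAMP_RUN(2)
STAMP_RUN(4)
#undef STAMP_RUN

int main() {
  // Each stamped specialization is independently usable at its unroll factor.
  return Run<4>::unroll() - Run<2>::unroll() - Run<2>::unroll();  // 0
}

diff --git 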
a/benchmarks/gather/main.cpp b/benchmarks/gather/main.cpp
index 7f4fc9ede6c..07fca9fdc64 100644
--- a/benchmarks/gather/main.cpp
+++ b/benchmarks/gather/main.cpp
@@ -16,7 +16,7 @@
 #include
 #include
-#include <gather.hpp>
+#include "gather.hpp"
 #include
 int main(int argc, char* argv[]) {
diff --git a/benchmarks/launch_latency/CMakeLists.txt b/benchmarks/launch_latency/CMakeLists.txt
new file mode 100644
index 00000000000..bb14da749d1
--- /dev/null
+++ b/benchmarks/launch_latency/CMakeLists.txt
@@ -0,0 +1,4 @@
+KOKKOS_ADD_EXECUTABLE(
+  launch_latency
+  SOURCES launch_latency.cpp
+)
diff --git a/benchmarks/launch_latency/launch_latency.cpp b/benchmarks/launch_latency/launch_latency.cpp
new file mode 100644
index 00000000000..73b176ab8dd
--- /dev/null
+++ b/benchmarks/launch_latency/launch_latency.cpp
@@ -0,0 +1,283 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+/*! \file launch_latency.cpp
+
+    Tests of parallel_for and parallel_reduce latency for different
+    circumstances.
+
+    Three launch kinds are tested: parallel_for, parallel_reduce into scalar,
+    and parallel_reduce into view
+
+    N controls how large the parallel loop is
+    V controls how large the functor is
+    M controls across how many launches the latency is averaged
+    K controls how large the nested loop is (no larger than V)
+
+    For each launch kind,
+    1. Avg functor dispatch latency: (time to do M launches) / M
+    2. Avg functor completion throughput: (M launches + sync) / M
+    3. 
Avg functor completion latency: (M (launch + sync)) / M
+*/
+
+#include <Kokkos_Core.hpp>
+
+template <int V>
+struct TestFunctor {
+  double values[V];
+  Kokkos::View<double*> a;
+  int K;
+  TestFunctor(Kokkos::View<double*> a_, int K_) : a(a_), K(K_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i) const {
+    for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j];
+  }
+};
+
+template <int V>
+struct TestRFunctor {
+  double values[V];
+  Kokkos::View<double*> a;
+  int K;
+  TestRFunctor(Kokkos::View<double*> a_, int K_) : a(a_), K(K_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i, double& lsum) const {
+    for (int j = 0; j < K; j++) a(i) += 1.0 * i * values[j];
+    lsum += a(i);
+  }
+};
+
+struct Opts {
+  bool par_for         = true;
+  bool par_reduce      = true;
+  bool par_reduce_view = true;
+};
+
+template <int V>
+void run(int N, int M, int K, const Opts& opts) {
+  std::string l_no_fence, l_fence, l_red_no_fence, l_red_fence,
+      l_red_view_no_fence, l_red_view_fence;
+  {
+    std::ostringstream ostream;
+    ostream << "RunNoFence_" << N << "_" << K << std::endl;
+    l_no_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunFence_" << N << "_" << K << std::endl;
+    l_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunReduceNoFence_" << N << "_" << K << std::endl;
+    l_red_no_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunReduceFence_" << N << "_" << K << std::endl;
+    l_red_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunReduceViewNoFence_" << N << "_" << K << std::endl;
+    l_red_view_no_fence = ostream.str();
+  }
+  {
+    std::ostringstream ostream;
+    ostream << "RunReduceViewFence_" << N << "_" << K << std::endl;
+    l_red_view_fence = ostream.str();
+  }
+
+  double result;
+  Kokkos::View<double*> a("A", N);
+  Kokkos::View<double> v_result("result");
+  TestFunctor<V> f(a, K);
+  TestRFunctor<V> rf(a, K);
+  Kokkos::Timer timer;
+
+  // initialize to an obviously wrong value
+  double time_no_fence        = -1;  // launch loop
+  double time_no_fence_fenced = -1;  // launch loop then fence
+  double time_fence           = -1;  // launch&fence loop
+
+  double time_red_no_fence        = -1;
+  double time_red_no_fence_fenced = -1;
+  double time_red_fence           = -1;
+
+  double time_red_view_no_fence        = -1;
+  double time_red_view_no_fence_fenced = -1;
+  double time_red_view_fence           = -1;
+
+  if (opts.par_for) {
+    // warmup
+    for (int i = 0; i < 4; ++i) {
+      Kokkos::parallel_for(l_no_fence, N, f);
+    }
+    Kokkos::fence();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_for(l_no_fence, N, f);
+    }
+    time_no_fence = timer.seconds();
+    Kokkos::fence();
+    time_no_fence_fenced = timer.seconds();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_for(l_fence, N, f);
+      Kokkos::fence();
+    }
+    time_fence = timer.seconds();
+  }
+
+  if (opts.par_reduce) {
+    // warmup
+    for (int i = 0; i < 4; ++i) {
+      Kokkos::parallel_reduce(l_red_no_fence, N, rf, result);
+    }
+    Kokkos::fence();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_no_fence, N, rf, result);
+    }
+    time_red_no_fence = timer.seconds();
+    Kokkos::fence();
+    time_red_no_fence_fenced = timer.seconds();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_fence, N, rf, result);
+      Kokkos::fence();
+    }
+    time_red_fence = timer.seconds();
+    Kokkos::fence();
+  }
+
+  if (opts.par_reduce_view) {
+    // warmup
+    for (int i = 0; i < 4; ++i) {
+      Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result);
+    }
+    Kokkos::fence();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+
Kokkos::parallel_reduce(l_red_view_no_fence, N, rf, v_result);
+    }
+    time_red_view_no_fence = timer.seconds();
+    Kokkos::fence();
+    time_red_view_no_fence_fenced = timer.seconds();
+
+    timer.reset();
+    for (int i = 0; i < M; i++) {
+      Kokkos::parallel_reduce(l_red_view_fence, N, rf, v_result);
+      Kokkos::fence();
+    }
+    time_red_view_fence = timer.seconds();
+    Kokkos::fence();
+    timer.reset();
+  }
+
+  const double x = 1.e6 / M;
+  printf("%i %i %i %i", N, V, K, M);
+  if (opts.par_for) {
+    printf(" parallel_for: %lf %lf ( %lf )", x * time_no_fence, x * time_fence,
+           x * time_no_fence_fenced);
+  }
+  if (opts.par_reduce) {
+    printf(" parallel_reduce: %lf %lf ( %lf )", x * time_red_no_fence,
+           x * time_red_fence, x * time_red_no_fence_fenced);
+  }
+  if (opts.par_reduce_view) {
+    printf(" parallel_reduce(view): %lf %lf ( %lf )",
+           x * time_red_view_no_fence, x * time_red_view_fence,
+           x * time_red_view_no_fence_fenced);
+  }
+  printf("\n");
+}
+int main(int argc, char* argv[]) {
+  Kokkos::initialize(argc, argv);
+  {
+    int N = 10000;
+    int M = 20;
+    int K = 1;
+
+    Opts opts;
+
+    printf("==========================\n");
+    printf("Kokkos Launch Latency Test\n");
+    printf("==========================\n");
+    printf("\n");
+    printf("Usage: %s ARGUMENTS [OPTIONS...]\n\n", argv[0]);
+    printf("Arguments: N M K\n");
+    printf("  N: loop length\n");
+    printf("  M: how many kernels to dispatch\n");
+    printf(
+        "  K: nested loop length (capped by the size of the functor member "
+        "array)\n\n");
+    printf("Options:\n");
+    printf("  --no-parallel-for: skip parallel_for benchmark\n");
+    printf("  --no-parallel-reduce: skip parallel_reduce benchmark\n");
+    printf(
+        "  --no-parallel-reduce-view: skip parallel_reduce into view "
+        "benchmark\n");
+    printf("\n\n");
+    printf("  Output V is the size of the functor member array\n");
+    printf("\n\n");
+
+    for (int i = 1; i < argc; ++i) {
+      const std::string_view arg(argv[i]);
+
+      // anything that doesn't start with --
+      if (arg.size() < 2 ||
+          (arg.size() >= 2 && (arg[0] != '-' || arg[1] != '-'))) {
+        if (i == 1)
+          N = atoi(arg.data());
+        else if (i == 2)
+          M = atoi(arg.data());
+        else if (i == 3)
+          K = atoi(arg.data());
+        else {
+          throw std::runtime_error("unexpected argument!");
+        }
+      } else if (arg == "--no-parallel-for") {
+        opts.par_for = false;
+      } else if (arg == "--no-parallel-reduce") {
+        opts.par_reduce = false;
+      } else if (arg == "--no-parallel-reduce-view") {
+        opts.par_reduce_view = false;
+      } else {
+        std::stringstream ss;
+        ss << "unexpected argument \"" << arg << "\" at position " << i;
+        throw std::runtime_error(ss.str());
+      }
+    }
+
+    printf("N V K M time_no_fence time_fence (time_no_fence_fenced)\n");
+
+    /* A backend may have different launch strategies for functors of different
+     * sizes: test a variety of functor sizes.*/
+    run<1>(N, M, K <= 1 ? K : 1, opts);
+    run<16>(N, M, K <= 16 ? K : 16, opts);
+    run<200>(N, M, K <= 200 ? K : 200, opts);
+    run<3000>(N, M, K <= 3000 ? K : 3000, opts);
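For orientation, a stripped-down sketch of the timing idiom the benchmark uses: time M asynchronous launches for average dispatch latency, then fence once for average completion latency. Names and sizes are illustrative only:

#include <cstdio>
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int N = 10000, M = 20;
    Kokkos::View<double*> a("A", N);
    Kokkos::Timer timer;
    for (int i = 0; i < M; ++i)
      Kokkos::parallel_for(
          "bump", N, KOKKOS_LAMBDA(int j) { a(j) += 1.0; });
    const double t_dispatch = timer.seconds();  // M launches, no sync
    Kokkos::fence();
    const double t_complete = timer.seconds();  // launches + one fence
    std::printf("avg dispatch: %g us, avg completion: %g us\n",
                1.e6 * t_dispatch / M, 1.e6 * t_complete / M);
  }
  Kokkos::finalize();
}

+    run<30000>(N, M, K <= 30000 ? 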
K : 30000, opts); + } + Kokkos::finalize(); +} diff --git a/benchmarks/policy_performance/CMakeLists.txt b/benchmarks/policy_performance/CMakeLists.txt new file mode 100644 index 00000000000..929b9c97023 --- /dev/null +++ b/benchmarks/policy_performance/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + policy_performance + SOURCES main.cpp +) diff --git a/benchmarks/policy_performance/main.cpp b/benchmarks/policy_performance/main.cpp index 28cfde552a5..0983a3d535c 100644 --- a/benchmarks/policy_performance/main.cpp +++ b/benchmarks/policy_performance/main.cpp @@ -106,8 +106,9 @@ int main(int argc, char* argv[]) { Kokkos::parallel_reduce( "parallel_reduce warmup", Kokkos::TeamPolicy<>(10, 1), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type team, - double& lval) { lval += 1; }, + KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type&, double& lval) { + lval += 1; + }, result); using view_type_1d = Kokkos::View; diff --git a/benchmarks/policy_performance/policy_perf_test.hpp b/benchmarks/policy_performance/policy_perf_test.hpp index cc2cc40257b..0e23d221f67 100644 --- a/benchmarks/policy_performance/policy_perf_test.hpp +++ b/benchmarks/policy_performance/policy_perf_test.hpp @@ -21,13 +21,13 @@ struct ParallelScanFunctor { using value_type = double; ViewType v; - ParallelScanFunctor(const ViewType& v_) : v(v_) {} + explicit ParallelScanFunctor(const ViewType& v_) : v(v_) {} KOKKOS_INLINE_FUNCTION - void operator()(const int idx, value_type& val, const bool& final) const { + void operator()(const int idx, value_type& val, const bool& is_final) const { // inclusive scan val += v(idx); - if (final) { + if (is_final) { v(idx) = val; } } @@ -109,7 +109,7 @@ void test_policy(int team_range, int thread_range, int vector_range, vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); } v2(idx, t) = vector_result; @@ -128,7 +128,7 @@ void test_policy(int team_range, int thread_range, int vector_range, team_result = 0.0; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { lval += 1; }, team_result); + [&](const int, double& lval) { lval += 1; }, team_result); } v1(idx) = team_result; // prevent compiler optimizing loop away @@ -170,13 +170,13 @@ void test_policy(int team_range, int thread_range, int vector_range, for (int tr = 0; tr < thread_repeat; ++tr) { Kokkos::parallel_reduce( Kokkos::TeamThreadRange(team, thread_range), - [&](const int t, double& lval) { + [&](const int, double& lval) { double vector_result = 0.0; for (int vr = 0; vr < inner_repeat; ++vr) { vector_result = 0.0; Kokkos::parallel_reduce( Kokkos::ThreadVectorRange(team, vector_range), - [&](const int vi, double& vval) { vval += 1; }, + [&](const int, double& vval) { vval += 1; }, vector_result); lval += vector_result; } diff --git a/benchmarks/stream/CMakeLists.txt b/benchmarks/stream/CMakeLists.txt new file mode 100644 index 00000000000..0dded6e3a54 --- /dev/null +++ b/benchmarks/stream/CMakeLists.txt @@ -0,0 +1,4 @@ +KOKKOS_ADD_EXECUTABLE( + stream + SOURCES stream-kokkos.cpp +) diff --git a/bin/nvcc_wrapper b/bin/nvcc_wrapper index c1400872402..9b935835d5f 100755 --- a/bin/nvcc_wrapper +++ b/bin/nvcc_wrapper @@ -229,7 +229,7 @@ do fi ;; #Handle known nvcc args - 
--dryrun|--verbose|--keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-G|-lineinfo|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|-dryrun|--verbose|--keep|-keep|--source-in-ptx|-src-in-ptx|--keep-dir*|-keep-dir*|-G|-lineinfo|--generate-line-info|-extended-lambda|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-res-usage|-fmad=*|--use_fast_math|-use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args diff --git a/cmake/KokkosConfig.cmake.in b/cmake/KokkosConfig.cmake.in index e26c75b3122..1b6d1b66ff5 100644 --- a/cmake/KokkosConfig.cmake.in +++ b/cmake/KokkosConfig.cmake.in @@ -39,10 +39,12 @@ IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) GLOBAL CHECK_CUDA_COMPILES) -ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) +ELSEIF(@Kokkos_ENABLE_CUDA@ + AND NOT @KOKKOS_COMPILE_LANGUAGE@ STREQUAL CUDA + AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) # - # if CUDA was enabled, separable compilation was not specified, and current compiler - # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # if CUDA was enabled, the compilation language was not set to CUDA, and separable compilation was not + # specified, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, # otherwise, the original command will be executed diff --git a/cmake/KokkosCore_config.h.in b/cmake/KokkosCore_config.h.in index 9930d2abf0f..2df0f6c5205 100644 --- a/cmake/KokkosCore_config.h.in +++ b/cmake/KokkosCore_config.h.in @@ -23,8 +23,6 @@ #cmakedefine KOKKOS_ENABLE_CUDA #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX -#cmakedefine KOKKOS_ENABLE_MEMKIND -#cmakedefine KOKKOS_ENABLE_LIBRT #cmakedefine KOKKOS_ENABLE_SYCL #cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED @@ -32,6 +30,7 @@ #cmakedefine KOKKOS_ENABLE_CXX17 #cmakedefine KOKKOS_ENABLE_CXX20 #cmakedefine KOKKOS_ENABLE_CXX23 +#cmakedefine KOKKOS_ENABLE_CXX26 #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_CUDA_UVM @@ -45,7 +44,6 @@ #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK #cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK #cmakedefine KOKKOS_ENABLE_TUNING -#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3 #cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_4 #cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS @@ -53,17 +51,15 @@ #cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated #cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION #cmakedefine KOKKOS_ENABLE_IMPL_MDSPAN +#cmakedefine KOKKOS_ENABLE_ATOMICS_BYPASS /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC -#cmakedefine KOKKOS_USE_LIBRT -#cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL #cmakedefine KOKKOS_ENABLE_LIBQUADMATH -#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND #cmakedefine KOKKOS_ENABLE_ONEDPL +#cmakedefine KOKKOS_ENABLE_ROCTHRUST -#cmakedefine KOKKOS_ARCH_SSE42 #cmakedefine KOKKOS_ARCH_ARMV80 #cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX #cmakedefine KOKKOS_ARCH_ARMV81 @@ -78,6 +74,7 @@ #cmakedefine KOKKOS_ARCH_POWER7 #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine 
KOKKOS_ARCH_POWER9
+#cmakedefine KOKKOS_ARCH_RISCV_SG2042
 #cmakedefine KOKKOS_ARCH_INTEL_GEN
 #cmakedefine KOKKOS_ARCH_INTEL_DG1
 #cmakedefine KOKKOS_ARCH_INTEL_GEN9
diff --git a/cmake/Modules/FindTPLCUDA.cmake b/cmake/Modules/FindTPLCUDA.cmake
index 792c92c07e9..5a62c530fce 100644
--- a/cmake/Modules/FindTPLCUDA.cmake
+++ b/cmake/Modules/FindTPLCUDA.cmake
@@ -7,7 +7,8 @@ IF (NOT CUDAToolkit_ROOT)
   ENDIF()
 ENDIF()

-IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0")
+# FIXME CMake 3.28.4 creates more targets than we export
+IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0" AND CMAKE_VERSION VERSION_LESS "3.28.4")
   find_package(CUDAToolkit)
 ELSE()
   include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake)
diff --git a/cmake/Modules/FindTPLLIBRT.cmake b/cmake/Modules/FindTPLLIBRT.cmake
deleted file mode 100644
index e75da56b5b5..00000000000
--- a/cmake/Modules/FindTPLLIBRT.cmake
+++ /dev/null
@@ -1 +0,0 @@
-KOKKOS_FIND_IMPORTED(LIBRT HEADER time.h LIBRARY rt)
diff --git a/cmake/Modules/FindTPLMEMKIND.cmake b/cmake/Modules/FindTPLMEMKIND.cmake
deleted file mode 100644
index 20aaff22955..00000000000
--- a/cmake/Modules/FindTPLMEMKIND.cmake
+++ /dev/null
@@ -1 +0,0 @@
-KOKKOS_FIND_IMPORTED(MEMKIND HEADER memkind.h LIBRARY memkind)
diff --git a/cmake/Modules/FindTPLONEDPL.cmake b/cmake/Modules/FindTPLONEDPL.cmake
index 01791cff443..603510c315e 100644
--- a/cmake/Modules/FindTPLONEDPL.cmake
+++ b/cmake/Modules/FindTPLONEDPL.cmake
@@ -43,4 +43,7 @@ ELSE()
     COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0
   )
 ENDIF()
+
+  # Export oneDPL as a Kokkos dependency
+  KOKKOS_EXPORT_CMAKE_TPL(oneDPL)
 ENDIF()
diff --git a/cmake/Modules/FindTPLROCTHRUST.cmake b/cmake/Modules/FindTPLROCTHRUST.cmake
new file mode 100644
index 00000000000..dae7dc3c952
--- /dev/null
+++ b/cmake/Modules/FindTPLROCTHRUST.cmake
@@ -0,0 +1,15 @@
+# ROCm 5.6 and earlier set AMDGPU_TARGETS and GPU_TARGETS to all the supported
+# architectures. Therefore, we end up compiling Kokkos for all the supported
+# architectures. Starting with ROCm 5.7, AMDGPU_TARGETS and GPU_TARGETS are empty.
+# It is the user's job to set the variables. Since we are injecting the
+# architecture flag ourselves, we can leave the variables empty. To replicate the
+# behavior of ROCm 5.7 and later for earlier versions of ROCm, we set
+# AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If
+# the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them.
+SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +FIND_PACKAGE(rocthrust REQUIRED) +KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) + +# Export ROCTHRUST as a Kokkos dependency +KOKKOS_EXPORT_CMAKE_TPL(rocthrust) diff --git a/cmake/kokkos_arch.cmake b/cmake/kokkos_arch.cmake index 30764bde860..34e9f05986f 100644 --- a/cmake/kokkos_arch.cmake +++ b/cmake/kokkos_arch.cmake @@ -49,7 +49,6 @@ DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(WSM "Intel Westmere CPU") DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") @@ -60,13 +59,12 @@ DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (A DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(BGQ "IBM Blue Gene Q") -DECLARE_AND_CHECK_HOST_ARCH(POWER7 "IBM POWER7 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") +DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) SET(KOKKOS_SHOW_CUDA_ARCHS ON) @@ -191,9 +189,6 @@ IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ELSEIF(CUDAToolkit_BIN_DIR) GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) 
ENDIF() - IF (KOKKOS_ENABLE_CUDA) - SET(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND ON CACHE BOOL "enable CUDA Clang workarounds" FORCE) - ENDIF() ELSEIF (KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) SET(CUDA_ARCH_FLAG "-gpu") GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -cuda) @@ -342,18 +337,6 @@ IF (KOKKOS_ARCH_ZEN3) SET(KOKKOS_ARCH_AVX2 ON) ENDIF() -IF (KOKKOS_ARCH_WSM) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSSE4.2 - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=px - DEFAULT -msse4.2 - ) - SET(KOKKOS_ARCH_SSE42 ON) -ENDIF() - IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) SET(KOKKOS_ARCH_AVX ON) COMPILER_SPECIFIC_FLAGS( @@ -378,6 +361,23 @@ IF (KOKKOS_ARCH_HSW) ) ENDIF() +IF (KOKKOS_ARCH_RISCV_SG2042) + IF(NOT + (KOKKOS_CXX_COMPILER_ID STREQUAL GNU + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR + (KOKKOS_CXX_COMPILER_ID STREQUAL Clang + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) + ) + MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + ENDIF() + COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + DEFAULT -march=rv64imafdcv + ) +ENDIF() + + IF (KOKKOS_ARCH_BDW) SET(KOKKOS_ARCH_AVX2 ON) COMPILER_SPECIFIC_FLAGS( @@ -571,6 +571,11 @@ IF (KOKKOS_ENABLE_HIP) COMPILER_SPECIFIC_FLAGS( DEFAULT -fgpu-rdc ) + IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + COMPILER_SPECIFIC_LINK_OPTIONS( + DEFAULT --hip-link + ) + ENDIF() ELSE() COMPILER_SPECIFIC_FLAGS( DEFAULT -fno-gpu-rdc @@ -588,32 +593,44 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() # Check support for device_global variables -# FIXME_SYCL Once the feature test macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL is -# available, use that instead. -IF(KOKKOS_ENABLE_SYCL AND NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) +# FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device +# global variables with shared libraries using the "non-separable compilation" +# implementation. Otherwise, the feature is not supported when building shared +# libraries. Thus, we don't even check for support if shared libraries are +# requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. +IF(KOKKOS_ENABLE_SYCL) STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - CHECK_CXX_SOURCE_COMPILES(" - #include - using namespace sycl::ext::oneapi::experimental; - using namespace sycl; - - SYCL_EXTERNAL device_global Foo; - - void bar(queue q) { - q.single_task([=] { - Foo = 42; - }); - } - - int main(){ return 0; } - " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED - ) + INCLUDE(CheckCXXSymbolExists) + CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) + # Use the non-separable compilation implementation to support shared libraries as well. 
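(For context, the feature-test macro consulted above is defined by the SYCL extension itself. A minimal C++ probe in the spirit of this check might look as follows; this is a sketch, not the exact source CMake compiles:)

#include <sycl/sycl.hpp>

// Compilation fails when the extension is absent, mirroring the
// CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL ...) probe.
#ifndef SYCL_EXT_ONEAPI_DEVICE_GLOBAL
#error "sycl_ext_oneapi_device_global not supported by this toolchain"
#endif

sycl::ext::oneapi::experimental::device_global<int> Foo;

int main() { return 0; }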
+    COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+  ELSEIF(NOT BUILD_SHARED_LIBS)
+    INCLUDE(CheckCXXSourceCompiles)
+    CHECK_CXX_SOURCE_COMPILES("
+      #include <sycl/sycl.hpp>
+      using namespace sycl::ext::oneapi::experimental;
+      using namespace sycl;
+
+      SYCL_EXTERNAL device_global<int> Foo;
+
+      void bar(queue q) {
+        q.single_task([=] {
+          Foo = 42;
+        });
+      }
+
+      int main(){ return 0; }
+      "
+      KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+
+    IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED)
+      # Only the separable compilation implementation is supported.
+      COMPILER_SPECIFIC_FLAGS(
+        DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED
+      )
+    ENDIF()
   ENDIF()
 ENDIF()
@@ -767,30 +784,35 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
     COMPILER_SPECIFIC_FLAGS(
       IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__
     )
-  ELSEIF(KOKKOS_ARCH_INTEL_GEN9)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" -D__STRICT_ANSI__
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" -D__STRICT_ANSI__
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" -D__STRICT_ANSI__
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_DG1)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" -D__STRICT_ANSI__
-    )
-  ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" -D__STRICT_ANSI__
+  ELSE()
+    COMPILER_SPECIFIC_OPTIONS(
+      IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__
     )
-  ELSEIF(KOKKOS_ARCH_INTEL_PVC)
-    COMPILER_SPECIFIC_FLAGS(
-      IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" -D__STRICT_ANSI__
+    IF(KOKKOS_ARCH_INTEL_GEN9)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_DG1)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4"
+      )
+    ELSEIF(KOKKOS_ARCH_INTEL_PVC)
+      COMPILER_SPECIFIC_LINK_OPTIONS(
+        IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7"
       )
+    ENDIF()
   ENDIF()
 ENDIF()
@@ -1130,3 +1152,14 @@ MESSAGE(STATUS "Architectures:")
 FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST})
   MESSAGE(STATUS " ${Arch}")
 ENDFOREACH()
+
+
+IF(KOKKOS_ENABLE_ATOMICS_BYPASS)
+  IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined")
+    MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_ATOMICS_BYPASS=ON) if a host parallel or a device backend is enabled!")
+  ENDIF()
+  IF(NOT KOKKOS_ENABLE_SERIAL)
+    MESSAGE(FATAL_ERROR "Implementation bug")  # safeguard
+  ENDIF()
+  MESSAGE(STATUS "Atomics: **DISABLED**")
+ENDIF()
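What the bypass means semantically, as a hedged sketch rather than Kokkos's actual implementation: with no host-parallel and no device backend there is only one thread of execution, so an atomic update may legally degrade to a plain read-modify-write (the helper name below is hypothetical; KOKKOS_ENABLE_ATOMICS_BYPASS is the config macro added above):

#include <Kokkos_Core.hpp>

template <class T>
KOKKOS_INLINE_FUNCTION void add_maybe_bypassed(T* dest, T val) {
#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS
  *dest += val;                   // single-threaded: no race is possible
#else
  Kokkos::atomic_add(dest, val);  // real atomic read-modify-write
#endif
}

diff --git a/cmake/kokkos_compiler_id.cmake b/cmake/kokkos_compiler_id.cmake
index 04589befc3a..9135ca2b41c 100644
--- a/cmake/kokkos_compiler_id.cmake 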
+++ b/cmake/kokkos_compiler_id.cmake @@ -152,6 +152,7 @@ ENDIF() SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. Required compiler versions:") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) 8.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) 10.0.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) 15.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 8.2.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 19.0.5 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) 2021.1.1 or higher") @@ -210,6 +211,10 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) + MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + ENDIF() ENDIF() IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) diff --git a/cmake/kokkos_enable_options.cmake b/cmake/kokkos_enable_options.cmake index 89e23b019bd..a437f6132aa 100644 --- a/cmake/kokkos_enable_options.cmake +++ b/cmake/kokkos_enable_options.cmake @@ -48,7 +48,6 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda # resolved but we keep the option around a bit longer to be safe. KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 OFF "Whether code deprecated in major release 3 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") @@ -74,6 +73,7 @@ KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple ke # This option will go away eventually, but allows fallback to old implementation when needed. 
KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") KOKKOS_ENABLE_OPTION(IMPL_MDSPAN OFF "Whether to enable experimental mdspan support") KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") diff --git a/cmake/kokkos_pick_cxx_std.cmake b/cmake/kokkos_pick_cxx_std.cmake index d4eca651d42..ae14a10d531 100644 --- a/cmake/kokkos_pick_cxx_std.cmake +++ b/cmake/kokkos_pick_cxx_std.cmake @@ -7,6 +7,7 @@ KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INST SET(KOKKOS_ENABLE_CXX17 OFF) SET(KOKKOS_ENABLE_CXX20 OFF) SET(KOKKOS_ENABLE_CXX23 OFF) +SET(KOKKOS_ENABLE_CXX26 OFF) IF (KOKKOS_CXX_STANDARD) MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") ENDIF() diff --git a/cmake/kokkos_test_cxx_std.cmake b/cmake/kokkos_test_cxx_std.cmake index 7ad49fdd2d9..b075a3e36b5 100644 --- a/cmake/kokkos_test_cxx_std.cmake +++ b/cmake/kokkos_test_cxx_std.cmake @@ -74,6 +74,10 @@ ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") SET(KOKKOS_ENABLE_CXX23 ON) +ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + kokkos_set_cxx_standard_feature(26) + SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + SET(KOKKOS_ENABLE_CXX26 ON) ELSE() MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") ENDIF() diff --git a/cmake/kokkos_tpls.cmake b/cmake/kokkos_tpls.cmake index f124596a84e..6ef3b79bde2 100644 --- a/cmake/kokkos_tpls.cmake +++ b/cmake/kokkos_tpls.cmake @@ -32,19 +32,21 @@ FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) ENDFUNCTION() KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(MEMKIND Off) -IF(KOKKOS_ENABLE_MEMKIND) - SET(KOKKOS_ENABLE_HBWSPACE ON) -ENDIF() KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -KOKKOS_TPL_OPTION(LIBRT Off) IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_HAS_TRILINOS) SET(ROCM_DEFAULT ON) ELSE() SET(ROCM_DEFAULT OFF) ENDIF() +IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) + SET(ROCTHRUST_DEFAULT ON) +ELSE() + SET(ROCTHRUST_DEFAULT OFF) +ENDIF() KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) +KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) + IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) SET(ONEDPL_DEFAULT ON) ELSE() @@ -77,21 +79,18 @@ KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake KOKKOS_IMPORT_TPL(HPX INTERFACE) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(CUDA INTERFACE) -ENDIF() +KOKKOS_IMPORT_TPL(CUDA INTERFACE) KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) -KOKKOS_IMPORT_TPL(MEMKIND) IF (NOT WIN32) KOKKOS_IMPORT_TPL(THREADS INTERFACE) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_IMPORT_TPL(ROCM INTERFACE) - KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) ENDIF() +KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) KOKKOS_IMPORT_TPL(LIBQUADMATH) +KOKKOS_IMPORT_TPL(ROCTHRUST) IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) find_package(desul REQUIRED COMPONENTS atomics) @@ -119,7 +118,3 @@ STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable UNSET(KOKKOS_TPL_EXPORTS CACHE) SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) -IF (KOKKOS_ENABLE_MEMKIND) - 
SET(KOKKOS_ENABLE_HBWSPACE) - LIST(APPEND KOKKOS_MEMSPACE_LIST HBWSpace) -ENDIF() diff --git a/cmake/kokkos_tribits.cmake b/cmake/kokkos_tribits.cmake index b30ca70ab95..060a7a8472c 100644 --- a/cmake/kokkos_tribits.cmake +++ b/cmake/kokkos_tribits.cmake @@ -237,18 +237,10 @@ ENDMACRO() ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp MACRO(KOKKOS_CONFIGURE_CORE) - SET(FWD_BACKEND_LIST) - FOREACH(MEMSPACE ${KOKKOS_MEMSPACE_LIST}) - LIST(APPEND FWD_BACKEND_LIST ${MEMSPACE}) - ENDFOREACH() - FOREACH(BACKEND_ ${KOKKOS_ENABLED_DEVICES}) - LIST(APPEND FWD_BACKEND_LIST ${BACKEND_}) - ENDFOREACH() - MESSAGE(STATUS "Kokkos Devices: ${KOKKOS_ENABLED_DEVICES}, Kokkos Backends: ${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${FWD_BACKEND_LIST}") + MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${FWD_BACKEND_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_PostInclude.hpp "KOKKOS_POST_INCLUDE" "Kokkos_Post_Include" "${KOKKOS_BACKEND_POST_INCLUDE_LIST}") + KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") SET(_DEFAULT_HOST_MEMSPACE "::Kokkos::HostSpace") KOKKOS_OPTION(DEFAULT_DEVICE_MEMORY_SPACE "" STRING "Override default device memory space") KOKKOS_OPTION(DEFAULT_HOST_MEMORY_SPACE "" STRING "Override default host memory space") @@ -309,7 +301,6 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" DESTINATION ${KOKKOS_HEADER_DIR}) ENDMACRO() diff --git a/config/test_all_sandia b/config/test_all_sandia deleted file mode 100755 index 193a162a4e6..00000000000 --- a/config/test_all_sandia +++ /dev/null @@ -1,773 +0,0 @@ -#!/bin/bash -e - -# -# Global config -# - -set -o pipefail - -# Determine current machine. - -MACHINE="" -HOSTNAME=$(hostname) -PROCESSOR=`uname -p` - -if [[ "$HOSTNAME" =~ (white|ride).* ]]; then - MACHINE=white - module load git -fi - -if [[ "$HOSTNAME" =~ .*bowman.* ]]; then - MACHINE=bowman - module load git -fi - -if [[ "$HOSTNAME" == n* ]]; then # Warning: very generic name - if [[ "$PROCESSOR" = "aarch64" ]]; then - MACHINE=sullivan - module load git - fi -fi - -if [[ "$HOSTNAME" == node* ]]; then # Warning: very generic name - if [[ "$MACHINE" = "" ]]; then - MACHINE=shepard - module load git - fi -fi - -if [[ "$HOSTNAME" == apollo\.* ]]; then - MACHINE=apollo - module load git -fi - -if [[ "$HOSTNAME" == sullivan ]]; then - MACHINE=sullivan - module load git -fi - -if [[ "$HOSTNAME" == mayer\.* ]]; then - MACHINE=mayer -# module load git -fi -if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name - MACHINE=mayer -fi - -if [ ! 
-z "$SEMS_MODULEFILES_ROOT" ]; then - if [[ "$MACHINE" = "" ]]; then - MACHINE=sems - module load sems-git - fi -fi - -if [[ "$MACHINE" = "" ]]; then - echo "Unrecognized machine" >&2 - exit 1 -fi - -echo "Running on machine: $MACHINE" - -GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" -CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" -CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" - -GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" -IBM_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CLANG_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -INTEL_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -#CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CUDA_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" -PGI_WARNING_FLAGS="" - -# Default. Machine specific can override. -DEBUG=False -ARGS="" -CUSTOM_BUILD_LIST="" -DRYRUN=False -BUILD_ONLY=False -declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1 -TEST_SCRIPT=False -SKIP_HWLOC=False -SPOT_CHECK=False - -PRINT_HELP=False -OPT_FLAG="" -CXX_FLAGS_EXTRA="" -LD_FLAGS_EXTRA="" -KOKKOS_OPTIONS="" - -# -# Handle arguments. -# - -while [[ $# > 0 ]] -do - key="$1" - - case $key in - --kokkos-path*) - KOKKOS_PATH="${key#*=}" - ;; - --build-list*) - CUSTOM_BUILD_LIST="${key#*=}" - ;; - --debug*) - DEBUG=True - ;; - --build-only*) - BUILD_ONLY=True - ;; - --test-script*) - TEST_SCRIPT=True - ;; - --skip-hwloc*) - SKIP_HWLOC=True - ;; - --num*) - NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" - ;; - --dry-run*) - DRYRUN=True - ;; - --spot-check*) - SPOT_CHECK=True - ;; - --arch*) - ARCH_FLAG="--arch=${key#*=}" - ;; - --opt-flag*) - OPT_FLAG="${key#*=}" - ;; - --with-cuda-options*) - KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" - ;; - --with-options*) - KOKKOS_OPTIONS="--with-options=enable_large_mem_tests,${key#*=}" - ;; - --cxxflags-extra*) - CXX_FLAGS_EXTRA="${key#*=}" - ;; - --ldflags-extra*) - LD_FLAGS_EXTRA="${key#*=}" - ;; - --help*) - PRINT_HELP=True - ;; - *) - # args, just append - ARGS="$ARGS $1" - ;; - esac - - shift -done - -SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd ) - -# Set kokkos path. -if [ -z "$KOKKOS_PATH" ]; then - KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT -else - # Ensure KOKKOS_PATH is abs path. - KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd ) -fi - -UNCOMMITTED=`cd ${KOKKOS_PATH}; git status --porcelain 2>/dev/null` -if ! [ -z "$UNCOMMITTED" ]; then - echo "WARNING!! THE FOLLOWING CHANGES ARE UNCOMMITTED!! :" - echo "$UNCOMMITTED" - echo "" -fi - -GITSTATUS=`cd ${KOKKOS_PATH}; git log -n 1 --format=oneline` -echo "Repository Status: " ${GITSTATUS} -echo "" -echo "" - -# -# Machine specific config. 
-# - -if [ "$MACHINE" = "sems" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - - BASE_MODULE_LIST="sems-env,kokkos-env,kokkos-hwloc/1.10.1/base,sems-/" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-/,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="" - fi - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - fi -elif [ "$MACHINE" = "white" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="/" - IBM_MODULE_LIST="/xl/" - CUDA_MODULE_LIST="/,gcc/6.4.0,ibm/xl/16.1.0" - - # Don't do pthread on white. 
- GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" - "cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=Power8,Kepler37" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "bowman" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=KNL" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "sullivan" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.1.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-ThunderX" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "mayer" ]; then - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=96 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - ARM_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/1.4.0 $ARM_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS") - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=ARMv8-TX2" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "shepard" ]; then - source /etc/profile.d/modules.sh - SKIP_HWLOC=True - export SLURM_TASKS_PER_NODE=32 - - BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>" - BASE_MODULE_LIST_INTEL="<COMPILER_NAME>/compilers/<COMPILER_VERSION>" - - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "pgi/17.10.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - ) - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=HSW" - fi - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -elif [ "$MACHINE" = "apollo" ]; then - source /projects/sems/modulefiles/utils/sems-modules-init.sh - module use /home/projects/modulefiles/local/x86-64 - module load kokkos-env - - module load sems-git - module load sems-tex - module load sems-cmake/3.5.2 - module load sems-gdb - - SKIP_HWLOC=True - - BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base" - CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base" - CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base" - - CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/9.0.69" - NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0" - - BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" -
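# Each COMPILERS entry above and below is one whitespace-separated record,
# "compiler module-list build-list exe-name warning-flag"; the getter
# functions later split it with an unquoted array assignment. A sketch with
# sample data (hypothetical values):
example_record="gcc/5.3.0 sems-env,sems-gcc/5.3.0 OpenMP,Serial g++ -Wall,-Werror"
example_arr=($example_record)
echo "compiler=${example_arr[0]} exe=${example_arr[3]}"
echo "modules: $(echo "${example_arr[1]}" | tr , ' ')"   # comma list to words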
BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" - BUILD_LIST_CLANG="Serial,Pthread,OpenMP" - - if [ "$SPOT_CHECK" = "True" ]; then - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS" - "cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - ) - else - # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("cuda/9.1 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - "clang/6.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" - "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" - ) - fi - - if [ -z "$ARCH_FLAG" ]; then - ARCH_FLAG="--arch=SNB,Volta70" - fi - - NUM_JOBS_TO_RUN_IN_PARALLEL=1 - -else - echo "Unhandled machine $MACHINE" >&2 - exit 1 -fi - -export OMP_NUM_THREADS=4 - -declare -i NUM_RESULTS_TO_KEEP=7 - -RESULT_ROOT_PREFIX=TestAll - -if [ "$PRINT_HELP" = "True" ]; then - echo "test_all_sandia :" - echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory" - echo " Defaults to root repo containing this script" - echo "--debug: Run tests in debug. Defaults to False" - echo "--test-script: Test this script, not Kokkos" - echo "--skip-hwloc: Do not do hwloc tests" - echo "--num=N: Number of jobs to run in parallel" - echo "--spot-check: Minimal test set to issue pull request" - echo "--dry-run: Just print what would be executed" - echo "--build-only: Just do builds, don't run anything" - echo "--opt-flag=FLAG: Optimization flag (default: -O3)" - echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS" - echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS" - echo "--arch=ARCHITECTURE: overwrite architecture flags" - echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" - echo "--build-list=BUILD,BUILD,BUILD..." 
- echo " Provide a comma-separated list of builds instead of running all builds" - echo " Valid items:" - echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" - echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" - echo "" - - echo "ARGS: list of expressions matching compilers to test" - echo " supported compilers sems" - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - echo " $COMPILER" - done - echo "" - - echo "Examples:" - echo " Run all tests" - echo " % test_all_sandia" - echo "" - echo " Run all gcc tests" - echo " % test_all_sandia gcc" - echo "" - echo " Run all gcc/4.8.4 and all intel tests" - echo " % test_all_sandia gcc/4.8.4 intel" - echo "" - echo " Run all tests in debug" - echo " % test_all_sandia --debug" - echo "" - echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" - echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" - echo "" - echo "If you want to kill the tests, do:" - echo " hit ctrl-z" - echo " % kill -9 %1" - echo - exit 0 -fi - -# Set build type. -if [ "$DEBUG" = "True" ]; then - BUILD_TYPE=debug -else - BUILD_TYPE=release -fi - -# If no args provided, do all compilers. -if [ -z "$ARGS" ]; then - ARGS='?' -fi - -# Process args to figure out which compilers to test. -COMPILERS_TO_TEST="" - -for ARG in $ARGS; do - for COMPILER_DATA in "${COMPILERS[@]}"; do - ARR=($COMPILER_DATA) - COMPILER=${ARR[0]} - - if [[ "$COMPILER" = $ARG* ]]; then - if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then - COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER" - else - echo "Tried to add $COMPILER twice" - fi - fi - done -done - -# -# Functions. -# - -# get_compiler_name <COMPILER> -get_compiler_name() { - echo $1 | cut -d/ -f1 -} - -# get_compiler_version <COMPILER> -get_compiler_version() { - echo $1 | cut -d/ -f2 -} - -# Do not call directly. -get_compiler_data() { - local compiler=$1 - local item=$2 - local compiler_name=$(get_compiler_name $compiler) - local compiler_vers=$(get_compiler_version $compiler) - - local compiler_data - for compiler_data in "${COMPILERS[@]}" ; do - local arr=($compiler_data) - - if [ "$compiler" = "${arr[0]}" ]; then - echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g" - return 0 - fi - done - - # Not found. - echo "Unrecognized compiler $compiler" >&2 - exit 1 -} - -# -# For all getters, usage: <GETTER> <COMPILER> -# - -get_compiler_modules() { - get_compiler_data $1 1 -} - -get_compiler_build_list() { - get_compiler_data $1 2 -} - -get_compiler_exe_name() { - get_compiler_data $1 3 -} - -get_compiler_warning_flags() { - get_compiler_data $1 4 -} - -run_cmd() { - echo "RUNNING: $*" - if [ "$DRYRUN" != "True" ]; then - eval "$* 2>&1" - fi -} - -# report_and_log_test_result <SUCCESS> <DESC> <COMMENT> -report_and_log_test_result() { - # Use sane var names. - local success=$1; local desc=$2; local comment=$3; - - if [ "$success" = "0" ]; then - echo " PASSED $desc" - echo $comment > $PASSED_DIR/$desc - else - # For failures, comment should be the name of the phase that failed. - echo " FAILED $desc" >&2 - echo $comment > $FAILED_DIR/$desc - cat ${desc}.${comment}.log - fi -} - -setup_env() { - local compiler=$1 - local compiler_modules=$(get_compiler_modules $compiler) - - module purge - - local mod - for mod in $compiler_modules; do - echo "Loading module $mod" - module load $mod 2>&1 - # It is ridiculously hard to check for the success of a loaded - # module. Module does not return error codes and piping to grep - # causes module to run in a subshell.
- module list 2>&1 | grep "$mod" >& /dev/null || return 1 - done - - return 0 -} - -# single_build_and_test -single_build_and_test() { - # Use sane var names. - local compiler=$1; local build=$2; local build_type=$3; - - # Set up env. - mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type" - cd $ROOT_DIR/$compiler/"${build}-$build_type" - local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g') - setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - - # Set up flags. - local compiler_warning_flags=$(get_compiler_warning_flags $compiler) - local compiler_exe=$(get_compiler_exe_name $compiler) - - if [[ "$build_type" = hwloc* ]]; then - local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info))) - fi - - if [[ "$OPT_FLAG" = "" ]]; then - OPT_FLAG="-O3" - fi - - if [[ "$build_type" = *debug* ]]; then - local extra_args="$extra_args --debug" - local cxxflags="-g $compiler_warning_flags" - local ldflags="-g" - else - local cxxflags="$OPT_FLAG $compiler_warning_flags" - local ldflags="${OPT_FLAG}" - fi - - local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" - local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" - - if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" - fi - if [[ "$KOKKOS_OPTIONS" != "" ]]; then - local extra_args="$extra_args $KOKKOS_OPTIONS" - else - local extra_args="$extra_args --with-options=enable_large_mem_tests" - fi - - echo " Starting job $desc" - - local comment="no_comment" - - if [ "$TEST_SCRIPT" = "True" ]; then - local rand=$[ 1 + $[ RANDOM % 10 ]] - sleep $rand - - if [ $rand -gt 5 ]; then - run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } - fi - else - run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } - local -i build_start_time=$(date +%s) - run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } - local -i build_end_time=$(date +%s) - comment="build_time=$(($build_end_time-$build_start_time))" - - if [[ "$BUILD_ONLY" == False ]]; then - run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } - local -i run_end_time=$(date +%s) - comment="$comment run_time=$(($run_end_time-$build_end_time))" - fi - fi - - report_and_log_test_result 0 $desc "$comment" - - return 0 -} - -# wait_for_jobs -wait_for_jobs() { - local -i max_jobs=$1 - local -i num_active_jobs=$(jobs | wc -l) - while [ $num_active_jobs -ge $max_jobs ] - do - sleep 1 - num_active_jobs=$(jobs | wc -l) - jobs >& /dev/null - done -} - -# run_in_background -run_in_background() { - local compiler=$1 - - local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL - # Don't override command line input. - # if [[ "$BUILD_ONLY" == True ]]; then - # num_jobs=8 - # else - if [[ "$compiler" == cuda* ]]; then - num_jobs=1 - fi - if [[ "$compiler" == clang ]]; then - num_jobs=1 - fi - # fi - wait_for_jobs $num_jobs - - single_build_and_test $* & -} - -# build_and_test_all -build_and_test_all() { - # Get compiler data. - local compiler=$1 - if [ -z "$CUSTOM_BUILD_LIST" ]; then - local compiler_build_list=$(get_compiler_build_list $compiler) - else - local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ') - fi - - # Do builds. 
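# The job-throttling pattern used by wait_for_jobs/run_in_background above,
# reduced to a self-contained sketch: poll the shell's job table and block
# until a slot frees up (the extra 'jobs' call reaps finished entries).
example_throttle() {
  local -i max_jobs=$1
  while [ "$(jobs | wc -l)" -ge "$max_jobs" ]; do
    sleep 1
    jobs >& /dev/null
  done
}
# for i in 1 2 3; do example_throttle 2; sleep 5 & done; wait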
- local build - for build in $compiler_build_list - do - run_in_background $compiler $build $BUILD_TYPE - - # If not cuda, do a hwloc test too. - if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then - run_in_background $compiler $build "hwloc-$BUILD_TYPE" - fi - done - - return 0 -} - -get_test_root_dir() { - local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort) - local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l) - local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP} - - if [ $num_to_delete -gt 0 ]; then - /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete) - fi - - echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S") -} - -wait_summarize_and_exit() { - wait_for_jobs 1 - - echo "#######################################################" - echo "PASSED TESTS" - echo "#######################################################" - - local passed_test - for passed_test in $(\ls -1 $PASSED_DIR | sort) - do - echo $passed_test $(cat $PASSED_DIR/$passed_test) - done - - local -i rv=0 - if [ "$(ls -A $FAILED_DIR)" ]; then - echo "#######################################################" - echo "FAILED TESTS" - echo "#######################################################" - - local failed_test - for failed_test in $(\ls -1 $FAILED_DIR | sort) - do - echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" - rv=$rv+1 - done - fi - - exit $rv -} - -# -# Main. -# - -ROOT_DIR=$(get_test_root_dir) -mkdir -p $ROOT_DIR -cd $ROOT_DIR - -PASSED_DIR=$ROOT_DIR/results/passed -FAILED_DIR=$ROOT_DIR/results/failed -mkdir -p $PASSED_DIR -mkdir -p $FAILED_DIR - -echo "Going to test compilers: " $COMPILERS_TO_TEST -for COMPILER in $COMPILERS_TO_TEST; do - echo "Testing compiler $COMPILER" - build_and_test_all $COMPILER -done - -wait_summarize_and_exit diff --git a/config/yaml/volta.yaml b/config/yaml/volta.yaml deleted file mode 100644 index f67af9c2a44..00000000000 --- a/config/yaml/volta.yaml +++ /dev/null @@ -1,4 +0,0 @@ -packages: - kokkos: - variants: +cuda +openmp +volta70 +cuda_lambda +wrapper ^cuda@10.1 - compiler: [gcc@7.2.0] diff --git a/containers/src/Kokkos_Bitset.hpp b/containers/src/Kokkos_Bitset.hpp index cd5ca4ea512..f50ab0a0f7e 100644 --- a/containers/src/Kokkos_Bitset.hpp +++ b/containers/src/Kokkos_Bitset.hpp @@ -28,24 +28,6 @@ namespace Kokkos { -namespace Impl { -//! Either append to the label if the property already exists, or set it. -template -auto with_updated_label(const ViewCtorProp& view_ctor_prop, - const std::string& label) { - using vcp_t = ViewCtorProp; - //! If the label property is already set, append. Otherwise, set label. - if constexpr (vcp_t::has_label) { - vcp_t new_ctor_props(view_ctor_prop); - static_cast&>(new_ctor_props) - .value.append(label); - return new_ctor_props; - } else { - return Impl::with_properties_if_unset(view_ctor_prop, label); - } -} -} // namespace Impl - template class Bitset; @@ -92,9 +74,10 @@ class Bitset { using block_view_type = View>; public: - /// constructor + Bitset() = default; + /// arg_size := number of bit in set - Bitset(unsigned arg_size = 0u) : Bitset(Kokkos::view_alloc(), arg_size) {} + Bitset(unsigned arg_size) : Bitset(Kokkos::view_alloc(), arg_size) {} template Bitset(const Impl::ViewCtorProp& arg_prop, unsigned arg_size) @@ -108,9 +91,8 @@ class Bitset { "Allocation properties should not contain the 'pointer' property."); //! Update 'label' property and allocate. 
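// In sketch form (hypothetical user label): with_properties_if_unset() only
// supplies the fallback, so
//   Bitset<Device> a(Kokkos::view_alloc("MyBitset"), 128);  // label "MyBitset"
//   Bitset<Device> b(128);                                  // default label "Bitset"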
- const auto prop_copy = Kokkos::Impl::with_updated_label( - Impl::with_properties_if_unset(arg_prop, std::string("Bitset")), - " - blocks"); + const auto prop_copy = + Impl::with_properties_if_unset(arg_prop, std::string("Bitset")); m_blocks = block_view_type(prop_copy, ((m_size + block_mask) >> block_shift)); @@ -310,8 +292,8 @@ class Bitset { } private: - unsigned m_size; - unsigned m_last_block_mask; + unsigned m_size = 0; + unsigned m_last_block_mask = 0; block_view_type m_blocks; private: diff --git a/containers/src/Kokkos_DualView.hpp b/containers/src/Kokkos_DualView.hpp index 84bced2cc44..e821570a8d5 100644 --- a/containers/src/Kokkos_DualView.hpp +++ b/containers/src/Kokkos_DualView.hpp @@ -292,15 +292,6 @@ class DualView : public ViewTraits { d_view(src.d_view), h_view(src.h_view) {} - //! Copy assignment operator (shallow copy assignment) - template - DualView& operator=(const DualView& src) { - modified_flags = src.modified_flags; - d_view = src.d_view; - h_view = src.h_view; - return *this; - } - //! Subview constructor template DualView(const DualView& src, const Arg0& arg0, Args... args) diff --git a/containers/src/Kokkos_DynRankView.hpp b/containers/src/Kokkos_DynRankView.hpp index 52aa86d8ee4..5fa59f1b7cd 100644 --- a/containers/src/Kokkos_DynRankView.hpp +++ b/containers/src/Kokkos_DynRankView.hpp @@ -1340,7 +1340,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits::type, Args...>( - v.data(), v.impl_map().layout()); + auto layout = v.impl_map().layout(); + + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + for (int i = N; i < 7; ++i) + layout.dimension[i] = KOKKOS_IMPL_CTOR_DEFAULT_ARG; + } + + return View::type, Args...>(v.data(), layout); } template diff --git a/containers/src/Kokkos_OffsetView.hpp b/containers/src/Kokkos_OffsetView.hpp index 92aead28784..91a7e4a9273 100644 --- a/containers/src/Kokkos_OffsetView.hpp +++ b/containers/src/Kokkos_OffsetView.hpp @@ -124,15 +124,8 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( args...); Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if it is not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. 
*/ - if (tracker.has_record()) { - Kokkos::Impl::operator_bounds_error_on_device(map); - } else { Kokkos::abort("OffsetView bounds error"); })) + KOKKOS_IF_ON_DEVICE( + (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) } } diff --git a/containers/src/Kokkos_UnorderedMap.hpp b/containers/src/Kokkos_UnorderedMap.hpp index e001c062de3..78a6a238ece 100644 --- a/containers/src/Kokkos_UnorderedMap.hpp +++ b/containers/src/Kokkos_UnorderedMap.hpp @@ -243,16 +243,16 @@ class UnorderedMap { using const_map_type = UnorderedMap; - static const bool is_set = std::is_void::value; - static const bool has_const_key = - std::is_same::value; - static const bool has_const_value = - is_set || std::is_same::value; + static constexpr bool is_set = std::is_void_v; + static constexpr bool has_const_key = + std::is_same_v; + static constexpr bool has_const_value = + is_set || std::is_same_v; - static const bool is_insertable_map = + static constexpr bool is_insertable_map = !has_const_key && (is_set || !has_const_value); - static const bool is_modifiable_map = has_const_key && !has_const_value; - static const bool is_const_map = has_const_key && has_const_value; + static constexpr bool is_modifiable_map = has_const_key && !has_const_value; + static constexpr bool is_const_map = has_const_key && has_const_value; using insert_result = UnorderedMapInsertResult; @@ -337,27 +337,27 @@ class UnorderedMap { Impl::get_property(prop_copy) + " - size")); m_available_indexes = - bitset_type(Kokkos::Impl::with_updated_label(prop_copy, " - bitset"), + bitset_type(Kokkos::Impl::append_to_label(prop_copy, " - bitset"), calculate_capacity(capacity_hint)); m_hash_lists = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - hash list"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - hash list"), Impl::find_hash_size(capacity())); m_next_index = size_type_view( - Kokkos::Impl::with_updated_label(prop_copy_noinit, " - next index"), + Kokkos::Impl::append_to_label(prop_copy_noinit, " - next index"), capacity() + 1); // +1 so that the *_at functions can always return a // valid reference - m_keys = key_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - keys"), capacity()); + m_keys = key_type_view(Kokkos::Impl::append_to_label(prop_copy, " - keys"), + capacity()); - m_values = value_type_view( - Kokkos::Impl::with_updated_label(prop_copy, " - values"), - is_set ? 0 : capacity()); + m_values = + value_type_view(Kokkos::Impl::append_to_label(prop_copy, " - values"), + is_set ? 0 : capacity()); m_scalars = - scalars_view(Kokkos::Impl::with_updated_label(prop_copy, " - scalars")); + scalars_view(Kokkos::Impl::append_to_label(prop_copy, " - scalars")); /** * Deep copies should also be done using the space instance if given. diff --git a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index 8f8cd9523b7..a979ee40d8c 100644 --- a/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -27,6 +27,18 @@ namespace Kokkos { namespace Impl { +//! Append to the label contained in view_ctor_prop. 
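// A usage sketch of the helper introduced below (hypothetical label "MyMap";
// assumes <Kokkos_Core.hpp> is included and Kokkos is initialized). The
// returned properties allocate with the label "MyMap - keys", matching the
// UnorderedMap constructor calls earlier in this diff.
inline auto example_append_to_label() {
  auto props = Kokkos::view_alloc("MyMap");
  return Kokkos::Impl::append_to_label(props, " - keys");
}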
+template +auto append_to_label(const ViewCtorProp& view_ctor_prop, + const std::string& label) { + using vcp_t = ViewCtorProp; + static_assert(vcp_t::has_label); + vcp_t new_ctor_props(view_ctor_prop); + static_cast&>(new_ctor_props) + .value.append(label); + return new_ctor_props; +} + uint32_t find_hash_size(uint32_t size); template diff --git a/containers/unit_tests/Makefile b/containers/unit_tests/Makefile index 2e35832cc89..18410882bca 100644 --- a/containers/unit_tests/Makefile +++ b/containers/unit_tests/Makefile @@ -35,8 +35,8 @@ TESTS = Bitset DualView DynamicView DynViewAPI_generic DynViewAPI_rank12345 DynV tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include" > Test$(device)_$(test).cpp); \ - $(shell echo "\#include" >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include" >> Test$(device)_$(test).cpp); \ )\ ) \ ) diff --git a/containers/unit_tests/TestBitset.hpp b/containers/unit_tests/TestBitset.hpp index 3ad0d2bf573..9923453f72c 100644 --- a/containers/unit_tests/TestBitset.hpp +++ b/containers/unit_tests/TestBitset.hpp @@ -23,6 +23,8 @@ #include #include +#include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp> + namespace Test { namespace Impl { @@ -155,7 +157,7 @@ void test_bitset() { { unsigned ts = 100u; - bitset_type b1; + bitset_type b1(Kokkos::view_alloc("MyBitset"), 0); ASSERT_TRUE(b1.is_allocated()); b1 = bitset_type(ts); @@ -165,6 +167,9 @@ void test_bitset() { ASSERT_TRUE(b1.is_allocated()); ASSERT_TRUE(b2.is_allocated()); ASSERT_TRUE(b3.is_allocated()); + + bitset_type b4; + ASSERT_FALSE(b4.is_allocated()); } std::array test_sizes = { @@ -237,6 +242,24 @@ void test_bitset() { } TEST(TEST_CATEGORY, bitset) { test_bitset(); } + +TEST(TEST_CATEGORY, bitset_default_constructor_no_alloc) { + using namespace Kokkos::Test::Tools; + listen_tool_events(Config::DisableAll(), Config::EnableAllocs()); + + auto success = validate_absence( + [&]() { + Kokkos::Bitset bs; + EXPECT_FALSE(bs.is_allocated()); + }, + [&](AllocateDataEvent) { + return MatchDiagnostic{true, {"Found alloc event"}}; + }); + ASSERT_TRUE(success); + + listen_tool_events(Config::DisableAll()); +} + } // namespace Test #endif // KOKKOS_TEST_BITSET_HPP diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 7f3916da312..e0dba03e1ec 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -50,8 +50,8 @@ ELSE() FetchContent_Declare( googlebenchmark DOWNLOAD_EXTRACT_TIMESTAMP FALSE - URL https://github.com/google/benchmark/archive/refs/tags/v1.6.2.tar.gz - URL_HASH MD5=14d14849e075af116143a161bc3b927b + URL https://github.com/google/benchmark/archive/refs/tags/v1.7.1.tar.gz + URL_HASH MD5=0459a6c530df9851bee6504c3e37c2e7 ) FetchContent_MakeAvailable(googlebenchmark) list(POP_BACK CMAKE_MESSAGE_INDENT) diff --git a/core/src/CMakeLists.txt b/core/src/CMakeLists.txt index 012af0a7d06..b84677e61b6 100644 --- a/core/src/CMakeLists.txt +++ b/core/src/CMakeLists.txt @@ -18,10 +18,16 @@ IF (NOT desul_FOUND) ENDIF() IF(KOKKOS_ENABLE_SYCL) SET(DESUL_ATOMICS_ENABLE_SYCL ON) + IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + ENDIF() ENDIF() IF(KOKKOS_ENABLE_OPENMPTARGET) SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos 
OpenMPTarget -> Desul OpenMP ENDIF() + IF(KOKKOS_ENABLE_OPENACC) + SET(DESUL_ATOMICS_ENABLE_OPENACC ON) + ENDIF() CONFIGURE_FILE( ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp @@ -80,10 +86,6 @@ IF (KOKKOS_ENABLE_HPX) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) ENDIF() -IF (NOT KOKKOS_ENABLE_MEMKIND) - LIST(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/Kokkos_HBWSpace.cpp) -ENDIF() - IF (KOKKOS_ENABLE_SERIAL) APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) @@ -180,20 +182,15 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) ENDIF() KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC MEMKIND) -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -ENDIF() +KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread IF (NOT WIN32) KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) ENDIF() IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) ENDIF() # FIXME: We need a proper solution to figure out whether to enable diff --git a/core/src/Cuda/Kokkos_Cuda.hpp b/core/src/Cuda/Kokkos_Cuda.hpp index 8bfaf8317b6..276d03da265 100644 --- a/core/src/Cuda/Kokkos_Cuda.hpp +++ b/core/src/Cuda/Kokkos_Cuda.hpp @@ -46,7 +46,6 @@ static_assert(false, namespace Kokkos { namespace Impl { -class CudaExec; class CudaInternal; } // namespace Impl } // namespace Kokkos @@ -129,33 +128,16 @@ class Cuda { /// \brief True if and only if this method is being called in a /// thread-parallel function. - KOKKOS_INLINE_FUNCTION static int in_parallel() { + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__CUDA_ARCH__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. - * - * This function sets the device in a "sleep" state in which it is - * not ready for work. This may consume less resources than if the - * device were in an "awake" state, but it may also take time to - * bring the device from a sleep state to be ready for work. - * - * \return True if the device is in the "sleep" state, else false if - * the device is actively working and could not enter the "sleep" - * state. - */ - static bool sleep(); - - /// \brief Wake the device from the 'sleep' state so it is ready for work. - /// - /// \return True if the device is in the "ready" state, else "false" - /// if the device is actively working (which also means that it's - /// awake). - static bool wake(); +#endif /// \brief Wait until all dispatched functors complete. /// @@ -199,18 +181,37 @@ class Cuda { //! Initialize, telling the CUDA run-time library which device to use. static void impl_initialize(InitializationSettings const&); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Cuda device architecture of the selected device. /// /// This matches the __CUDA_ARCH__ specification. - static size_type device_arch(); + KOKKOS_DEPRECATED static size_type device_arch() { + const cudaDeviceProp& cudaProp = Cuda().cuda_device_prop(); + return cudaProp.major * 100 + cudaProp.minor; + } //! Query device count. 
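// Reduced to plain CUDA runtime calls, the deprecated inline bodies above and
// below amount to the following sketch (error checking omitted, device 0
// assumed, CUDA runtime headers available):
inline int example_device_arch() {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  return prop.major * 100 + prop.minor;  // same numbering as __CUDA_ARCH__
}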
- static size_type detect_device_count(); + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; + } /** \brief Detect the available devices and their architecture * as defined by the __CUDA_ARCH__ specification. */ - static std::vector detect_device_arch(); + KOKKOS_DEPRECATED static std::vector detect_device_arch() { + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + std::vector out; + for (int i = 0; i < count; ++i) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + out.push_back(prop.major * 100 + prop.minor); + } + return out; + } +#endif cudaStream_t cuda_stream() const; int cuda_device() const; diff --git a/core/src/Cuda/Kokkos_CudaSpace.cpp b/core/src/Cuda/Kokkos_CudaSpace.cpp index c6512f44dad..0944937e1bf 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -33,7 +33,6 @@ //#include #include -#include #include @@ -83,11 +82,11 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { KOKKOS_IMPL_CUDA_SAFE_CALL( (CudaInternal::singleton().cuda_memcpy_async_wrapper( dst, src, n, cudaMemcpyDefault, s))); - Impl::cuda_stream_synchronize( - s, + Kokkos::Tools::Experimental::Impl::profile_fence_event( + "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync", Kokkos::Tools::Experimental::SpecialSynchronizationCases:: DeepCopyResourceSynchronization, - "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync"); + [&]() { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(s)); }); } } // namespace Impl @@ -135,11 +134,23 @@ void kokkos_impl_cuda_set_pin_uvm_to_host(bool val) { namespace Kokkos { -CudaSpace::CudaSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaSpace::CudaSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaSpace::CudaSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {} +CudaUVMSpace::CudaUVMSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaUVMSpace::CudaUVMSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} -CudaHostPinnedSpace::CudaHostPinnedSpace() {} +CudaHostPinnedSpace::CudaHostPinnedSpace() + : m_device(Kokkos::Cuda().cuda_device()), + m_stream(Kokkos::Cuda().cuda_stream()) {} +CudaHostPinnedSpace::CudaHostPinnedSpace(int device_id, cudaStream_t stream) + : m_device(device_id), m_stream(stream) {} size_t memory_threshold_g = 40000; // 40 kB @@ -161,52 +172,38 @@ void *CudaSpace::allocate(const char *arg_label, const size_t arg_alloc_size, } namespace { -void *impl_allocate_common(const Cuda &exec_space, const char *arg_label, - const size_t arg_alloc_size, +void *impl_allocate_common(const int device_id, + [[maybe_unused]] const cudaStream_t stream, + const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle, - bool exec_space_provided) { + [[maybe_unused]] bool stream_sync_only) { void *ptr = nullptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + cudaError_t error_code = cudaSuccess; #ifndef CUDART_VERSION #error CUDART_VERSION undefined! 
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - cudaError_t error_code; if (arg_alloc_size >= memory_threshold_g) { - if (exec_space_provided) { - error_code = - exec_space.impl_internal_space_instance()->cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - exec_space.fence("Kokkos::Cuda: backend fence after async malloc"); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_async_wrapper( - &ptr, arg_alloc_size); - Impl::cuda_device_synchronize( - "Kokkos::Cuda: backend fence after async malloc"); + error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); + + if (error_code == cudaSuccess) { + if (stream_sync_only) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream)); + } else { + Impl::cuda_device_synchronize( + "Kokkos::Cuda: backend fence after async malloc"); + } } - } else { - error_code = - (exec_space_provided - ? exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size) - : Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size)); - } -#else - cudaError_t error_code; - if (exec_space_provided) { - error_code = exec_space.impl_internal_space_instance()->cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } else { - error_code = Impl::CudaInternal::singleton().cuda_malloc_wrapper( - &ptr, arg_alloc_size); - } + } else #endif + { error_code = cudaMalloc(&ptr, arg_alloc_size); } if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - exec_space.impl_internal_space_instance()->cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: @@ -226,7 +223,7 @@ void *CudaSpace::impl_allocate( const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(Kokkos::Cuda{}, arg_label, arg_alloc_size, + return impl_allocate_common(m_device, m_stream, arg_label, arg_alloc_size, arg_logical_size, arg_handle, false); } @@ -234,8 +231,9 @@ void *CudaSpace::impl_allocate( const Cuda &exec_space, const char *arg_label, const size_t arg_alloc_size, const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - return impl_allocate_common(exec_space, arg_label, arg_alloc_size, - arg_logical_size, arg_handle, true); + return impl_allocate_common( + exec_space.cuda_device(), exec_space.cuda_stream(), arg_label, + arg_alloc_size, arg_logical_size, arg_handle, true); } void *CudaUVMSpace::allocate(const size_t arg_alloc_size) const { @@ -256,28 +254,27 @@ void *CudaUVMSpace::impl_allocate( if (arg_alloc_size > 0) { Kokkos::Impl::num_uvm_allocations++; - auto error_code = - Impl::CudaInternal::singleton().cuda_malloc_managed_wrapper( - &ptr, arg_alloc_size, cudaMemAttachGlobal); - -#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST - if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_mem_advise_wrapper( - ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, - cudaCpuDeviceId))); -#endif + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, 
which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: CudaMallocManaged); } + +#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST + if (Kokkos::CudaUVMSpace::cuda_pin_uvm_to_host()) + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemAdvise(ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, + cudaCpuDeviceId)); +#endif } Cuda::impl_static_fence( "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation"); @@ -302,13 +299,14 @@ void *CudaHostPinnedSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; - auto error_code = Impl::CudaInternal::singleton().cuda_host_alloc_wrapper( - &ptr, arg_alloc_size, cudaHostAllocDefault); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + cudaError_t error_code = + cudaHostAlloc(&ptr, arg_alloc_size, cudaHostAllocDefault); if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an // exception here - Impl::CudaInternal::singleton().cuda_get_last_error_wrapper(); + cudaGetLastError(); throw Experimental::CudaRawMemoryAllocationFailure( arg_alloc_size, error_code, Experimental::RawMemoryAllocationFailure::AllocationMechanism:: @@ -350,18 +348,17 @@ void CudaSpace::impl_deallocate( if (arg_alloc_size >= memory_threshold_g) { Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence before async free"); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_async_wrapper( - arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, m_stream)); Impl::cuda_device_synchronize( "Kokkos::Cuda: backend fence after async free"); } else { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } #else - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); #endif } catch (...) { } @@ -393,8 +390,8 @@ void CudaUVMSpace::impl_deallocate( try { if (arg_alloc_ptr != nullptr) { Kokkos::Impl::num_uvm_allocations--; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_free_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } } catch (...) { } @@ -424,8 +421,8 @@ void CudaHostPinnedSpace::impl_deallocate( reported_size); } try { - KOKKOS_IMPL_CUDA_SAFE_CALL(( - Impl::CudaInternal::singleton().cuda_free_host_wrapper(arg_alloc_ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); } catch (...) 
{ } } @@ -438,160 +435,6 @@ void CudaHostPinnedSpace::impl_deallocate( namespace Kokkos { namespace Impl { -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -//============================================================================== -// {{{1 - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -void SharedAllocationRecord::deep_copy_header_no_exec( - void *ptr, const void *header) { - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy(exec, ptr, header, - sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -// end SharedAllocationRecord destructors }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Cuda exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::Cuda &arg_exec_space, const Kokkos::CudaSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_exec_space, arg_space, - arg_label, arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, 
- sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -// end SharedAllocationRecord constructors }}}1 -//============================================================================== - void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; @@ -620,19 +463,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
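// The macros that replace these per-space instantiations below bundle the
// same pattern; in toy form (the DEMO_* name is hypothetical), one explicit
// instantiation definition per memory space:
//   #define DEMO_RECORD_INSTANTIATION(SPACE) \
//     template class SharedAllocationRecordCommon<SPACE>;
//   DEMO_RECORD_INSTANTIATION(Kokkos::CudaUVMSpace)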
-template class SharedAllocationRecordCommon; -template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::CudaHostPinnedSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/core/src/Cuda/Kokkos_CudaSpace.hpp b/core/src/Cuda/Kokkos_CudaSpace.hpp index b8fa335cd3b..0e20193e8b4 100644 --- a/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -68,6 +68,11 @@ class CudaSpace { /*--------------------------------*/ CudaSpace(); + + private: + CudaSpace(int device_id, cudaStream_t stream); + + public: CudaSpace(CudaSpace&& rhs) = default; CudaSpace(const CudaSpace& rhs) = default; CudaSpace& operator=(CudaSpace&& rhs) = default; @@ -89,9 +94,11 @@ class CudaSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaSpace impl_create(int device_id, cudaStream_t stream) { + return CudaSpace(device_id, stream); + } + private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const Cuda& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -112,10 +119,10 @@ class CudaSpace { static constexpr const char* name() { return m_name; } private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; static constexpr const char* m_name = "Cuda"; - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> @@ -149,6 +156,11 @@ class CudaUVMSpace { /*--------------------------------*/ CudaUVMSpace(); + + private: + CudaUVMSpace(int device_id, cudaStream_t stream); + + public: CudaUVMSpace(CudaUVMSpace&& rhs) = default; CudaUVMSpace(const CudaUVMSpace& rhs) = default; CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; @@ -156,6 +168,16 @@ class CudaUVMSpace { ~CudaUVMSpace() = default; /**\brief Allocate untracked memory in the cuda space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -167,8 +189,6 @@ class CudaUVMSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -189,8 +209,13 @@ class CudaUVMSpace { #endif /*--------------------------------*/ + static CudaUVMSpace impl_create(int device_id, cudaStream_t stream) { + return CudaUVMSpace(device_id, stream); + } + private: - int m_device; ///< Which Cuda device + int m_device; + cudaStream_t m_stream; #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST static bool kokkos_impl_cuda_pin_uvm_to_host_v; @@ -223,6 
+248,11 @@ class CudaHostPinnedSpace { /*--------------------------------*/ CudaHostPinnedSpace(); + + private: + CudaHostPinnedSpace(int device_id, cudaStream_t stream); + + public: CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; @@ -230,6 +260,16 @@ class CudaHostPinnedSpace { ~CudaHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -240,9 +280,11 @@ class CudaHostPinnedSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; + static CudaHostPinnedSpace impl_create(int device_id, cudaStream_t stream) { + return CudaHostPinnedSpace(device_id, stream); + } + private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -258,6 +300,9 @@ class CudaHostPinnedSpace { static constexpr const char* name() { return m_name; } private: + int m_device; + cudaStream_t m_stream; + static constexpr const char* m_name = "CudaHostPinned"; /*--------------------------------*/ @@ -280,15 +325,12 @@ const std::unique_ptr& cuda_get_deep_copy_space( bool initialize = true); static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); +static_assert(Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); //---------------------------------------- @@ -516,179 +558,10 @@ struct DeepCopy -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecord; - friend class SharedAllocationRecordCommon; - friend class HostInaccessibleSharedAllocationRecordCommon; - - using RecordBase = SharedAllocationRecord; - using base_t = - HostInaccessibleSharedAllocationRecordCommon; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::CudaSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
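// The workaround in toy form: a constructor template has to stay visible in
// the header (or be explicitly instantiated for every execution space in the
// .cpp), while non-template constructors can be defined out-of-line:
//   struct Record {
//     Record(int n);                                       // in Record.cpp
//     template <class Exec> Record(const Exec&, int n) {}  // header-only
//   };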
- template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - // workaround for issue with NVCC and MSVC - // https://github.com/kokkos/kokkos/issues/5258 - deep_copy_header_no_exec(RecordBase::m_alloc_ptr, &header); - } - - SharedAllocationRecord( - const Kokkos::Cuda& exec_space, const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::CudaSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - // helper function to work around MSVC+NVCC issue - // https://github.com/kokkos/kokkos/issues/5258 - static void deep_copy_header_no_exec(void*, const void*); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaUVMSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
- template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - - using RecordBase = SharedAllocationRecord; - using base_t = SharedAllocationRecordCommon; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static RecordBase s_root_record; - - const Kokkos::CudaHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::CudaSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/Cuda/Kokkos_Cuda_Error.hpp b/core/src/Cuda/Kokkos_Cuda_Error.hpp index f68e05f7804..c4458c910ca 100644 --- a/core/src/Cuda/Kokkos_Cuda_Error.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -27,10 +27,6 @@ namespace Kokkos { namespace Impl { -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string& name); void cuda_device_synchronize(const std::string& name); void cuda_stream_synchronize(const cudaStream_t stream, const std::string& name); diff --git a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index 
a4d064e544a..5a821ab64a3 100644 --- a/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -23,8 +23,7 @@ #include -#include // GraphAccess needs to be complete -#include // SharedAllocationRecord +#include // GraphAccess needs to be complete #include #include @@ -50,10 +49,6 @@ class GraphNodeKernelImpl m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - // Note: owned pointer to CudaSpace memory (used for global memory launches), - // which we're responsible for deallocating, but not responsible for calling - // its destructor. - using Record = Kokkos::Impl::SharedAllocationRecord; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; @@ -82,9 +77,7 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - - auto* record = Record::allocate( - Kokkos::CudaSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr) return m_driver_storage; } diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/core/src/Cuda/Kokkos_Cuda_Instance.cpp index d7f853d9910..849e8b3b30e 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -#include -#include -#include -#include +//#include +//#include +//#include +//#include #include #include #include @@ -97,21 +97,21 @@ __global__ void query_cuda_kernel_arch(int *d_arch) { } /** Query what compute capability is actually launched to the device: */ -int cuda_kernel_arch() { +int cuda_kernel_arch(int device_id) { int arch = 0; int *d_arch = nullptr; - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_malloc_wrapper( - reinterpret_cast(&d_arch), sizeof(int)))); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - d_arch, &arch, sizeof(int), cudaMemcpyDefault))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&d_arch), sizeof(int))); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault)); query_cuda_kernel_arch<<<1, 1>>>(d_arch); - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - &arch, d_arch, sizeof(int), cudaMemcpyDefault))); KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_free_wrapper(d_arch))); + cudaMemcpy(&arch, d_arch, sizeof(int), cudaMemcpyDefault)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(d_arch)); return arch; } @@ -135,7 +135,6 @@ Kokkos::View cuda_global_unique_token_locks( return locks; } -// FIXME_CUDA_MULTIPLE_DEVICES void cuda_device_synchronize(const std::string &name) { Kokkos::Tools::Experimental::Impl::profile_fence_event( name, @@ -144,16 +143,16 @@ void cuda_device_synchronize(const std::string &name) { #if defined(KOKKOS_COMPILER_CLANG) // annotate with __host__ silence a clang warning about using // cudaDeviceSynchronize in device code - [] __host__() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + [] __host__() #else - []() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - 
(CudaInternal::singleton().cuda_device_synchronize_wrapper())); - }); + []() #endif + { + for (int cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } + }); } void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, @@ -168,25 +167,11 @@ void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr, }); } -void cuda_stream_synchronize( - const cudaStream_t stream, - Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, - const std::string &name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, reason, [&]() { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_synchronize_wrapper( - stream))); - }); -} - void cuda_internal_error_throw(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -196,10 +181,8 @@ void cuda_internal_error_throw(cudaError e, const char *name, const char *file, void cuda_internal_error_abort(cudaError e, const char *name, const char *file, const int line) { std::ostringstream out; - out << name << " error( " - << CudaInternal::singleton().cuda_get_error_name_wrapper(e) - << "): " - << CudaInternal::singleton().cuda_get_error_string_wrapper(e); + out << name << " error( " << cudaGetErrorName(e) + << "): " << cudaGetErrorString(e); if (file) { out << " " << file << ":" << line; } @@ -208,96 +191,6 @@ void cuda_internal_error_abort(cudaError e, const char *name, const char *file, host_abort(out.str().c_str()); } -//---------------------------------------------------------------------------- -// Some significant cuda device properties: -// -// cudaDeviceProp::name : Text label for device -// cudaDeviceProp::major : Device major number -// cudaDeviceProp::minor : Device minor number -// cudaDeviceProp::warpSize : number of threads per warp -// cudaDeviceProp::multiProcessorCount : number of multiprocessors -// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block -// cudaDeviceProp::totalConstMem : capacity of constant memory -// cudaDeviceProp::totalGlobalMem : capacity of global memory -// cudaDeviceProp::maxGridSize[3] : maximum grid size - -// -// Section 4.4.2.4 of the CUDA Toolkit Reference Manual -// -// struct cudaDeviceProp { -// char name[256]; -// size_t totalGlobalMem; -// size_t sharedMemPerBlock; -// int regsPerBlock; -// int warpSize; -// size_t memPitch; -// int maxThreadsPerBlock; -// int maxThreadsDim[3]; -// int maxGridSize[3]; -// size_t totalConstMem; -// int major; -// int minor; -// int clockRate; -// size_t textureAlignment; -// int deviceOverlap; -// int multiProcessorCount; -// int kernelExecTimeoutEnabled; -// int integrated; -// int canMapHostMemory; -// int computeMode; -// int concurrentKernels; -// int ECCEnabled; -// int pciBusID; -// int pciDeviceID; -// int tccDriver; -// int asyncEngineCount; -// int unifiedAddressing; -// int memoryClockRate; -// int memoryBusWidth; -// int l2CacheSize; -// int maxThreadsPerMultiProcessor; -// }; - -namespace { - -class CudaInternalDevices { - public: - enum { MAXIMUM_DEVICE_COUNT = 64 }; - struct cudaDeviceProp 
m_cudaProp[MAXIMUM_DEVICE_COUNT]; - int m_cudaDevCount; - - CudaInternalDevices(); - - static const CudaInternalDevices &singleton(); -}; - -CudaInternalDevices::CudaInternalDevices() { - // See 'cudaSetDeviceFlags' for host-device thread interaction - // Section 4.4.2.6 of the CUDA Toolkit Reference Manual - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_count_wrapper( - &m_cudaDevCount))); - - if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) { - Kokkos::abort( - "Sorry, you have more GPUs per node than we thought anybody would ever " - "have. Please report this to github.com/kokkos/kokkos."); - } - for (int i = 0; i < m_cudaDevCount; ++i) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_device_properties_wrapper( - m_cudaProp + i, i))); - } -} - -const CudaInternalDevices &CudaInternalDevices::singleton() { - static CudaInternalDevices self; - return self; -} - -} // namespace - //---------------------------------------------------------------------------- int Impl::CudaInternal::concurrency() { @@ -307,8 +200,6 @@ int Impl::CudaInternal::concurrency() { } void CudaInternal::print_configuration(std::ostream &s) const { - const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); - #if defined(KOKKOS_ENABLE_CUDA) s << "macro KOKKOS_ENABLE_CUDA : defined\n"; #endif @@ -317,22 +208,23 @@ void CudaInternal::print_configuration(std::ostream &s) const { << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n'; #endif - for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { - s << "Kokkos::Cuda[ " << i << " ] " << dev_info.m_cudaProp[i].name - << " capability " << dev_info.m_cudaProp[i].major << "." - << dev_info.m_cudaProp[i].minor << ", Total Global Memory: " - << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem) + for (int i : get_visible_devices()) { + cudaDeviceProp prop; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); + s << "Kokkos::Cuda[ " << i << " ] " << prop.name << " capability " + << prop.major << "." 
<< prop.minor + << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem) << ", Shared Memory per Block: " - << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock); + << human_memory_size(prop.sharedMemPerBlock); if (m_cudaDev == i) s << " : Selected"; - s << std::endl; + s << '\n'; } } //---------------------------------------------------------------------------- CudaInternal::~CudaInternal() { - if (m_stream || m_scratchSpace || m_scratchFlags || m_scratchUnified) { + if (m_scratchSpace || m_scratchFlags || m_scratchUnified) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl; } @@ -370,45 +262,53 @@ void CudaInternal::fence() const { fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { +void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); was_initialized = true; + // Check that the device associated with the stream matches cuda_device + CUcontext context; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuStreamGetCtx(stream, &context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxPushCurrent(context))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&m_cudaDev))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); + + m_stream = stream; + CudaInternal::cuda_devices.insert(m_cudaDev); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (!constantMemHostStagingPerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( + reinterpret_cast(&constantMemHostStagingPerDevice[m_cudaDev]), + CudaTraits::ConstantMemoryUsage))); + + if (!constantMemReusablePerDevice[m_cudaDev]) + KOKKOS_IMPL_CUDA_SAFE_CALL( + (cuda_event_create_wrapper(&constantMemReusablePerDevice[m_cudaDev]))); + //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { - const unsigned reduce_block_count = - m_maxWarpCount * Impl::CudaTraits::WarpSize; + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. 
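The initialize() hunk above derives the device id from the user-provided stream instead of trusting a process-wide singleton. A minimal self-contained sketch of that driver-API sequence follows; the function name is illustrative, and where the hunk leaves the acquired context pushed before calling cudaSetDevice, this sketch pops it again so it stays side-effect free:

```cpp
#include <cuda.h>
#include <cuda_runtime.h>
#include <cassert>

// Recover the device that owns a cudaStream_t, mirroring the
// cuStreamGetCtx / cuCtxGetDevice sequence in CudaInternal::initialize().
int device_of_stream(cudaStream_t stream) {
  CUcontext context;
  assert(cuStreamGetCtx(stream, &context) == CUDA_SUCCESS);  // stream -> context
  assert(cuCtxPushCurrent(context) == CUDA_SUCCESS);   // make it current
  CUdevice device;
  assert(cuCtxGetDevice(&device) == CUDA_SUCCESS);     // context -> device
  assert(cuCtxPopCurrent(&context) == CUDA_SUCCESS);   // restore prior state
  return static_cast<int>(device);
}
```

This is what lets a Cuda instance constructed from a foreign stream record the correct m_cudaDev without any global device-count bookkeeping.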
+ auto const maxWarpCount = std::min( + m_deviceProp.maxThreadsPerBlock / CudaTraits::WarpSize, + CudaTraits::WarpSize); + unsigned const reduce_block_count = + maxWarpCount * Impl::CudaTraits::WarpSize; (void)scratch_unified(16 * sizeof(size_type)); (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); } - // Init the array for used for arbitrarily sized atomics - if (this == &singleton()) { - desul::Impl::init_lock_arrays(); // FIXME - } - - // Allocate a staging buffer for constant mem in pinned host memory - // and an event to avoid overwriting driver for previous kernel launches - if (this == &singleton()) { - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_host_wrapper( - reinterpret_cast(&constantMemHostStaging), - CudaTraits::ConstantMemoryUsage))); - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_create_wrapper(&constantMemReusable))); - } - - m_stream = stream; - m_manage_stream = manage_stream; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -427,22 +327,23 @@ void CudaInternal::initialize(cudaStream_t stream, bool manage_stream) { Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. 
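The scratch_flags() rewrite above, and the scratch_space()/scratch_unified() hunks that follow, all share one grow-only pattern. A condensed sketch of that pattern, where GrowOnlyScratch, MemSpace, and the label are illustrative stand-ins for the impl_create()-built memory spaces in the diff; note that of the three, only scratch_flags zero-fills:

```cpp
#include <cuda_runtime.h>
#include <cstddef>

// Grow-only device scratch: reallocate only when the request exceeds the
// current capacity; zero-fill only at (re)allocation time, as scratch_flags
// does. parallel_reduce/parallel_scan reset the words they actually use.
template <class MemSpace>
struct GrowOnlyScratch {
  MemSpace space;
  void* ptr = nullptr;
  std::size_t bytes = 0;

  void* get(std::size_t requested) {
    if (requested > bytes) {
      if (ptr) space.deallocate(ptr, bytes);  // release undersized buffer
      ptr = space.allocate("InternalScratch", requested);
      bytes = requested;
      cudaMemset(ptr, 0, bytes);  // zero once, on allocation only
    }
    return ptr;
  }
};
```

Keeping the zero-fill on the allocation path is what makes the comment above sufficient: consumers of scratch_flags only ever need to reset the values they used, never the whole buffer.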
KOKKOS_IMPL_CUDA_SAFE_CALL( (cuda_memset_wrapper(m_scratchFlags, 0, alloc_size))); } @@ -453,21 +354,19 @@ Cuda::size_type *CudaInternal::scratch_flags(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -476,23 +375,20 @@ Cuda::size_type *CudaInternal::scratch_space(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { if (verify_is_initialized("scratch_unified") && m_scratchUnifiedCount < scratch_count(size)) { - m_scratchUnifiedCount = scratch_count(size); + auto mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); - using Record = - Kokkos::Impl::SharedAllocationRecord; + if (m_scratchUnified) { + mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + } - if (m_scratchUnified) - Record::decrement(Record::get_record(m_scratchUnified)); + m_scratchUnifiedCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchUnifiedCount, sizeScratchGrain); - Record *const r = - Record::allocate(Kokkos::CudaHostPinnedSpace(), - "Kokkos::InternalScratchUnified", alloc_size); - - Record::increment(r); - - m_scratchUnified = reinterpret_cast(r->data()); + m_scratchUnified = static_cast( + mem_space.allocate("Kokkos::InternalScratchUnified", alloc_size)); } return m_scratchUnified; @@ -500,21 +396,16 @@ Cuda::size_type *CudaInternal::scratch_unified(const std::size_t size) const { Cuda::size_type *CudaInternal::scratch_functor(const std::size_t size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = - Kokkos::Impl::SharedAllocationRecord; - - if (m_scratchFunctor) - Record::decrement(Record::get_record(m_scratchFunctor)); + auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - Record *const r = - Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); + if (m_scratchFunctor) { + mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } - Record::increment(r); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast(r->data()); + m_scratchFunctor = static_cast(mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); } return m_scratchFunctor; @@ -537,21 +428,21 @@ void *CudaInternal::resize_team_scratch_space(int scratch_pool_id, // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. 
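The resize_team_scratch_space() hunk just below drops kokkos_realloc in favor of an explicit deallocate-then-allocate. A short sketch of why that is the cheaper choice, assuming only the allocate/deallocate interface used throughout these hunks (the helper name is hypothetical): team scratch is per-dispatch working memory, so nothing in the old buffer needs to survive the resize.

```cpp
#include <Kokkos_Core.hpp>
#include <cstddef>

// Resize-without-copy: valid whenever the old contents are dead, as for
// team scratch between dispatches. realloc semantics would pay for a
// device-to-device copy that nobody reads.
void* resize_discard(const Kokkos::CudaSpace& space, void* old_ptr,
                     std::size_t old_bytes, std::size_t new_bytes) {
  if (old_ptr) space.deallocate(old_ptr, old_bytes);
  return space.allocate("Kokkos::CudaSpace::TeamScratchMemory", new_bytes);
}
```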
+ auto mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (m_team_scratch_current_size[scratch_pool_id] == 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_malloc( - "Kokkos::CudaSpace::TeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && (force_shrink))) { + mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], + m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = - Kokkos::kokkos_realloc( - m_team_scratch_ptr[scratch_pool_id], - m_team_scratch_current_size[scratch_pool_id]); + mem_space.allocate("Kokkos::CudaSpace::TeamScratchMemory", bytes); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -568,50 +459,33 @@ void CudaInternal::finalize() { was_finalized = true; - // Only finalize this if we're the singleton - if (this == &singleton()) { - (void)Impl::cuda_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_free_host_wrapper(constantMemHostStaging))); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_event_destroy_wrapper(constantMemReusable))); - auto &deep_copy_space = - Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); - if (deep_copy_space) - deep_copy_space->impl_internal_space_instance()->finalize(); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_stream_destroy_wrapper(cuda_get_deep_copy_stream()))); - } - + auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordCuda = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; - - RecordCuda::decrement(RecordCuda::get_record(m_scratchFlags)); - RecordCuda::decrement(RecordCuda::get_record(m_scratchSpace)); - RecordHost::decrement(RecordHost::get_record(m_scratchUnified)); - if (m_scratchFunctorSize > 0) - RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor)); + auto host_mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); + cuda_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + cuda_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + host_mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + if (m_scratchFunctorSize > 0) { + cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } } for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) - Kokkos::kokkos_free(m_team_scratch_ptr[i]); + cuda_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); } - if (m_manage_stream && get_stream() != nullptr) - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_stream_destroy_wrapper(m_stream))); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchUnifiedCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; m_scratchUnified = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -624,30 +498,6 @@ void CudaInternal::finalize() { //---------------------------------------------------------------------------- -Cuda::size_type 
cuda_internal_multiprocessor_count() { - return CudaInternal::singleton().m_multiProcCount; -} - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count() { -#if defined(KOKKOS_ARCH_KEPLER) - // Compute capability 3.0 through 3.7 - enum : int { max_resident_blocks_per_multiprocessor = 16 }; -#else - // Compute capability 5.0 through 6.2 - enum : int { max_resident_blocks_per_multiprocessor = 32 }; -#endif - return CudaInternal::singleton().m_multiProcCount * - max_resident_blocks_per_multiprocessor; -}; - -Cuda::size_type cuda_internal_maximum_warp_count() { - return CudaInternal::singleton().m_maxWarpCount; -} - -std::array cuda_internal_maximum_grid_count() { - return CudaInternal::singleton().m_maxBlock; -} - Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance, const std::size_t size) { return instance.impl_internal_space_instance()->scratch_space(size); @@ -670,10 +520,6 @@ Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance, namespace Kokkos { -Cuda::size_type Cuda::detect_device_count() { - return Impl::CudaInternalDevices::singleton().m_cudaDevCount; -} - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int Cuda::concurrency() { #else @@ -687,25 +533,23 @@ int Cuda::impl_is_initialized() { } void Cuda::impl_initialize(InitializationSettings const &settings) { - const int cuda_device_id = Impl::get_gpu(settings); - const auto &dev_info = Impl::CudaInternalDevices::singleton(); - - const struct cudaDeviceProp &cudaProp = dev_info.m_cudaProp[cuda_device_id]; + const std::vector &visible_devices = Impl::get_visible_devices(); + const int cuda_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); - Impl::CudaInternal::m_cudaDev = cuda_device_id; + cudaDeviceProp cudaProp; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGetDeviceProperties(&cudaProp, cuda_device_id)); Impl::CudaInternal::m_deviceProp = cudaProp; - - Kokkos::Impl::cuda_device_synchronize( - "Kokkos::CudaInternal::initialize: Fence on space initialization"); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // Query what compute capability architecture a kernel executes: - Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(); + Impl::CudaInternal::m_cudaArch = Impl::cuda_kernel_arch(cuda_device_id); if (Impl::CudaInternal::m_cudaArch == 0) { - std::stringstream ss; - ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; - std::string msg = ss.str(); - Kokkos::abort(msg.c_str()); + Kokkos::abort( + "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"); } int compiled_major = Impl::CudaInternal::m_cudaArch / 100; @@ -761,77 +605,41 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default } #endif - //---------------------------------- - // number of multiprocessors - Impl::CudaInternal::m_multiProcCount = cudaProp.multiProcessorCount; - - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. 
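The helpers deleted here (multiprocessor count, maximum warp count, maximum grid sizes) are not lost: the surrounding hunks re-derive each of them on demand from the per-device cudaDeviceProp. A sketch of the equivalent queries, with a hypothetical struct and function name and error checking elided:

```cpp
#include <cuda_runtime.h>
#include <algorithm>

struct DeviceLimits {
  int multiprocessors;
  int max_warps_per_block;
  int max_grid[3];
};

DeviceLimits query_limits(int device) {
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);
  constexpr int warp_size = 32;  // Impl::CudaTraits::WarpSize
  return DeviceLimits{
      prop.multiProcessorCount,
      // at most WarpSize warps, and at most one warp per thread of a warp
      std::min(prop.maxThreadsPerBlock / warp_size, warp_size),
      {prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]}};
}
```

Replacing cached static members with on-demand queries is what makes these values correct per device rather than pinned to whichever device initialized the singleton.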
- Impl::CudaInternal::m_maxWarpCount = - cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize; - - if (Impl::CudaTraits::WarpSize < Impl::CudaInternal::m_maxWarpCount) { - Impl::CudaInternal::m_maxWarpCount = Impl::CudaTraits::WarpSize; - } - - //---------------------------------- - // Maximum number of blocks: - - Impl::CudaInternal::m_maxBlock[0] = cudaProp.maxGridSize[0]; - Impl::CudaInternal::m_maxBlock[1] = cudaProp.maxGridSize[1]; - Impl::CudaInternal::m_maxBlock[2] = cudaProp.maxGridSize[2]; - - Impl::CudaInternal::m_shmemPerSM = cudaProp.sharedMemPerMultiprocessor; - Impl::CudaInternal::m_maxShmemPerBlock = cudaProp.sharedMemPerBlock; - Impl::CudaInternal::m_maxBlocksPerSM = - Impl::CudaInternal::m_cudaArch < 500 - ? 16 - : (Impl::CudaInternal::m_cudaArch < 750 - ? 32 - : (Impl::CudaInternal::m_cudaArch == 750 ? 16 : 32)); - Impl::CudaInternal::m_maxThreadsPerSM = cudaProp.maxThreadsPerMultiProcessor; - Impl::CudaInternal::m_maxThreadsPerBlock = cudaProp.maxThreadsPerBlock; - //---------------------------------- cudaStream_t singleton_stream; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Impl::CudaInternal::singleton().cuda_stream_create_wrapper( - &singleton_stream))); - - auto &cuda_singleton = Impl::CudaInternal::singleton(); - cuda_singleton.initialize(singleton_stream, /*manage*/ true); -} + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); -std::vector Cuda::detect_device_arch() { - const Impl::CudaInternalDevices &s = Impl::CudaInternalDevices::singleton(); - - std::vector output(s.m_cudaDevCount); - - for (int i = 0; i < s.m_cudaDevCount; ++i) { - output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor; - } + // Init the array for used for arbitrarily sized atomics + desul::Impl::init_lock_arrays(); // FIXME - return output; + Impl::CudaInternal::singleton().initialize(singleton_stream); } -Cuda::size_type Cuda::device_arch() { - const int dev_id = Impl::CudaInternal::singleton().m_cudaDev; +void Cuda::impl_finalize() { + (void)Impl::cuda_global_unique_token_locks(true); + desul::Impl::finalize_lock_arrays(); // FIXME - int dev_arch = 0; - - if (0 <= dev_id) { - const struct cudaDeviceProp &cudaProp = - Impl::CudaInternalDevices::singleton().m_cudaProp[dev_id]; - - dev_arch = cudaProp.major * 100 + cudaProp.minor; + for (const auto cuda_device : Kokkos::Impl::CudaInternal::cuda_devices) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaFreeHost(Kokkos::Impl::CudaInternal::constantMemHostStagingPerDevice + [cuda_device])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy( + Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device])); } - return dev_arch; -} + auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if (deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::cuda_get_deep_copy_stream())); -void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } + Impl::CudaInternal::singleton().finalize(); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(Impl::CudaInternal::singleton().m_stream)); +} Cuda::Cuda() : m_space_instance(&Impl::CudaInternal::singleton(), @@ -845,13 +653,17 @@ KOKKOS_DEPRECATED Cuda::Cuda(cudaStream_t stream, bool manage_stream) manage_stream ? 
Impl::ManageStream::yes : Impl::ManageStream::no) {} Cuda::Cuda(cudaStream_t stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::CudaInternal, [manage_stream](Impl::CudaInternal *ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { diff --git a/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/core/src/Cuda/Kokkos_Cuda_Instance.hpp index a324adecfeb..24f4af31019 100644 --- a/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -22,6 +22,10 @@ #include #include #include +#include "Kokkos_CudaSpace.hpp" + +#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -55,27 +59,10 @@ struct CudaTraits { unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */; - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( - CudaSpace::size_type i) { - return (i + WarpIndexMask) >> WarpIndexShift; - } - - KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_align( - CudaSpace::size_type i) { - constexpr CudaSpace::size_type Mask = ~WarpIndexMask; - return (i + WarpIndexMask) & Mask; - } }; //---------------------------------------------------------------------------- -CudaSpace::size_type cuda_internal_multiprocessor_count(); -CudaSpace::size_type cuda_internal_maximum_warp_count(); -std::array cuda_internal_maximum_grid_count(); - -CudaSpace::size_type cuda_internal_maximum_concurrent_block_count(); - CudaSpace::size_type* cuda_internal_scratch_flags(const Cuda&, const std::size_t size); CudaSpace::size_type* cuda_internal_scratch_space(const Cuda&, @@ -101,18 +88,10 @@ class CudaInternal { public: using size_type = Cuda::size_type; - inline static int m_cudaDev = -1; + int m_cudaDev = -1; // Device Properties - inline static int m_cudaArch = -1; - inline static unsigned m_multiProcCount = 0; - inline static unsigned m_maxWarpCount = 0; - inline static std::array m_maxBlock = {0, 0, 0}; - inline static int m_shmemPerSM = 0; - inline static int m_maxShmemPerBlock = 0; - inline static int m_maxBlocksPerSM = 0; - inline static int m_maxThreadsPerSM = 0; - inline static int m_maxThreadsPerBlock = 0; + inline static int m_cudaArch = -1; static int concurrency(); inline static cudaDeviceProp m_deviceProp; @@ -129,7 +108,6 @@ class CudaInternal { mutable size_type* m_scratchFunctor; cudaStream_t m_stream; uint32_t m_instance_id; - bool m_manage_stream; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -142,11 +120,11 @@ class CudaInternal { bool was_initialized = false; bool was_finalized = false; - // FIXME_CUDA: these want to be per-device, not per-stream... 
use of 'static' - // here will break once there are multiple devices though - inline static unsigned long* constantMemHostStaging = nullptr; - inline static cudaEvent_t constantMemReusable = nullptr; - inline static std::mutex constantMemMutex; + inline static std::set cuda_devices = {}; + inline static std::map constantMemHostStagingPerDevice = + {}; + inline static std::map constantMemReusablePerDevice = {}; + inline static std::map constantMemMutexPerDevice = {}; static CudaInternal& singleton(); @@ -156,7 +134,7 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(cudaStream_t stream, bool manage_stream); + void initialize(cudaStream_t stream); void finalize(); void print_configuration(std::ostream&) const; @@ -247,12 +225,6 @@ class CudaInternal { return cudaDeviceSetLimit(limit, value); } - template - cudaError_t cuda_device_synchronize_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaDeviceSynchronize(); - } - template cudaError_t cuda_event_create_wrapper(cudaEvent_t* event) const { if constexpr (setCudaDevice) set_cuda_device(); @@ -290,37 +262,6 @@ class CudaInternal { return cudaFreeHost(ptr); } - template - cudaError_t cuda_get_device_count_wrapper(int* count) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceCount(count); - } - - template - cudaError_t cuda_get_device_properties_wrapper(cudaDeviceProp* prop, - int device) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetDeviceProperties(prop, device); - } - - template - const char* cuda_get_error_name_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorName(error); - } - - template - const char* cuda_get_error_string_wrapper(cudaError_t error) const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetErrorString(error); - } - - template - cudaError_t cuda_get_last_error_wrapper() const { - if constexpr (setCudaDevice) set_cuda_device(); - return cudaGetLastError(); - } - template cudaError_t cuda_graph_add_dependencies_wrapper( cudaGraph_t graph, const cudaGraphNode_t* from, const cudaGraphNode_t* to, @@ -506,10 +447,10 @@ class CudaInternal { } template - cudaError_t cuda_func_set_attributes_wrapper(T* entry, cudaFuncAttribute attr, - int value) const { + cudaError_t cuda_func_set_attribute_wrapper(T* entry, cudaFuncAttribute attr, + int value) const { if constexpr (setCudaDevice) set_cuda_device(); - return cudaFuncSetAttributes(entry, attr, value); + return cudaFuncSetAttribute(entry, attr, value); } template diff --git a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 82a72b69021..b0dadb45f72 100644 --- a/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -21,7 +21,6 @@ #ifdef KOKKOS_ENABLE_CUDA #include -#include #include #include #include @@ -118,42 +117,43 @@ inline bool is_empty_launch(dim3 const& grid, dim3 const& block) { } inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { - if (cuda_instance->m_maxShmemPerBlock < shmem) { + int const maxShmemPerBlock = cuda_instance->m_deviceProp.sharedMemPerBlock; + if (maxShmemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( - std::string("CudaParallelLaunch (or graph node creation) FAILED: shared" - " memory request is too large")); + "CudaParallelLaunch (or graph node creation) FAILED: shared memory " + "request is too large"); } } // These 
functions need to be templated on DriverType and LaunchBounds // so that the static bool is unique for each type combo // KernelFuncPtr does not necessarily contain that type information. -// FIXME_CUDA_MULTIPLE_DEVICES template const cudaFuncAttributes& get_cuda_kernel_func_attributes( - const KernelFuncPtr& func) { + int cuda_device, const KernelFuncPtr& func) { // Only call cudaFuncGetAttributes once for each unique kernel // by leveraging static variable initialization rules - auto wrap_get_attributes = [&]() -> cudaFuncAttributes { + static std::map func_attr; + if (func_attr.find(cuda_device) == func_attr.end()) { cudaFuncAttributes attr; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_get_attributes_wrapper(&attr, - func))); - return attr; - }; - static cudaFuncAttributes func_attr = wrap_get_attributes(); - return func_attr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncGetAttributes(&attr, func)); + func_attr.emplace(cuda_device, attr); + } + return func_attr[cuda_device]; } template -inline void configure_shmem_preference(const KernelFuncPtr& func, +inline void configure_shmem_preference(const int cuda_device, + const KernelFuncPtr& func, const cudaDeviceProp& device_props, const size_t block_size, int& shmem, const size_t occupancy) { #ifndef KOKKOS_ARCH_KEPLER const auto& func_attr = - get_cuda_kernel_func_attributes(func); + get_cuda_kernel_func_attributes(cuda_device, + func); // Compute limits for number of blocks due to registers/SM const size_t regs_per_sm = device_props.regsPerMultiprocessor; @@ -222,7 +222,7 @@ inline void configure_shmem_preference(const KernelFuncPtr& func, // FIXME_CUDA_MULTIPLE_DEVICES auto set_cache_config = [&] { KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_func_set_attributes_wrapper( + (CudaInternal::singleton().cuda_func_set_attribute_wrapper( func, cudaFuncAttributePreferredSharedMemoryCarveout, carveout))); return carveout; }; @@ -387,8 +387,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } void const* args[] = {&driver}; @@ -487,8 +487,8 @@ struct CudaParallelLaunchKernelInvoker< driver.get_policy().impl_get_desired_occupancy().value(); size_t block_size = block.x * block.y * block.z; Impl::configure_shmem_preference( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); @@ -576,13 +576,16 @@ struct CudaParallelLaunchKernelInvoker< static void invoke_kernel(DriverType const& driver, dim3 const& grid, dim3 const& block, int shmem, CudaInternal const* cuda_instance) { + int cuda_device = cuda_instance->m_cudaDev; // Wait until the previous kernel that uses the constant buffer is done - std::lock_guard lock(CudaInternal::constantMemMutex); + std::lock_guard lock( + CudaInternal::constantMemMutexPerDevice[cuda_device]); KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_synchronize_wrapper( - CudaInternal::constantMemReusable))); + 
CudaInternal::constantMemReusablePerDevice[cuda_device]))); // Copy functor (synchronously) to staging buffer in pinned host memory - unsigned long* staging = cuda_instance->constantMemHostStaging; + unsigned long* staging = + cuda_instance->constantMemHostStagingPerDevice[cuda_device]; memcpy(staging, &driver, sizeof(DriverType)); // Copy functor asynchronously from there to constant memory on the device @@ -597,7 +600,7 @@ struct CudaParallelLaunchKernelInvoker< // Record an event that says when the constant buffer can be reused KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_instance->cuda_event_record_wrapper( - CudaInternal::constantMemReusable))); + CudaInternal::constantMemReusablePerDevice[cuda_device]))); } inline static void create_parallel_launch_graph_node( @@ -665,8 +668,8 @@ struct CudaParallelLaunchImpl< Impl::configure_shmem_preference< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func(), cuda_instance->m_deviceProp, block_size, - shmem, desired_occupancy); + cuda_instance->m_cudaDev, base_t::get_kernel_func(), + cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } desul::ensure_cuda_lock_arrays_on_device(); @@ -675,18 +678,17 @@ struct CudaParallelLaunchImpl< base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - KOKKOS_IMPL_CUDA_SAFE_CALL( - (cuda_instance->cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); cuda_instance->fence( "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error"); #endif } } - static cudaFuncAttributes get_cuda_func_attributes() { + static cudaFuncAttributes get_cuda_func_attributes(int cuda_device) { return get_cuda_kernel_func_attributes< DriverType, Kokkos::LaunchBounds>( - base_t::get_kernel_func()); + cuda_device, base_t::get_kernel_func()); } }; diff --git a/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp index 7492ab49e56..2c7eba7a18f 100644 --- a/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp +++ b/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -40,8 +40,8 @@ template <> inline TileSizeProperties get_tile_size_properties( const Kokkos::Cuda& space) { TileSizeProperties properties; - properties.max_threads = - space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.max_threads = space.impl_internal_space_instance() + ->m_deviceProp.maxThreadsPerMultiProcessor; properties.default_largest_tile_size = 16; properties.default_tile_size = 2; properties.max_total_tile_size = 512; diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 49d6c112e37..63038984004 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -42,8 +41,8 @@ namespace Impl { template int max_tile_size_product_helper(const Policy& pol, const LaunchBounds&) { cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + pol.space().cuda_device()); auto const& prop = pol.space().cuda_device_prop(); // Limits due to registers/SM, MDRange doesn't have @@ -96,7 +95,7 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = cuda_internal_maximum_grid_count(); + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); 
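The constant-memory launch path above serializes access to a single __constant__ buffer using one pinned staging area, one mutex, and one event per device. A compressed sketch of that handoff protocol; the buffer size matches CudaTraits::ConstantMemoryUsage (32 KB), but the names, the inline launch comment, and the omission of KOKKOS_IMPL_CUDA_SAFE_CALL checking are all illustrative:

```cpp
#include <cuda_runtime.h>
#include <cstring>
#include <mutex>

// 32 KB of __constant__ memory holding the launched functor.
__constant__ unsigned long kernel_arg_buffer[4096];

void launch_via_constant_memory(int device, const void* functor, size_t bytes,
                                cudaStream_t stream, std::mutex& mutex,
                                unsigned long* pinned_staging,
                                cudaEvent_t reusable) {
  std::lock_guard<std::mutex> lock(mutex);  // one writer per device
  cudaSetDevice(device);
  cudaEventSynchronize(reusable);  // prior launch finished reading the symbol
  std::memcpy(pinned_staging, functor, bytes);  // host -> pinned staging
  cudaMemcpyToSymbolAsync(kernel_arg_buffer, pinned_staging, bytes, 0,
                          cudaMemcpyHostToDevice, stream);
  // ... <<<grid, block, shmem, stream>>> launch reading kernel_arg_buffer ...
  cudaEventRecord(reusable, stream);  // buffer is reusable after this point
}
```

Splitting constantMemHostStaging, constantMemReusable, and constantMemMutex into per-device maps is exactly what lets two devices run this protocol concurrently without contending on one global event.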
KOKKOS_ASSERT(block.x > 0); @@ -325,19 +324,18 @@ class ParallelReduce( f, n); using closure_type = Impl::ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 34729992812..0f052be3c30 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -86,18 +85,18 @@ class ParallelFor, Kokkos::Cuda> { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + CudaParallelLaunch::get_cuda_func_attributes( + m_policy.space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( m_policy.space().impl_internal_space_instance(), attr, m_functor, 1, 0, 0); KOKKOS_ASSERT(block_size > 0); dim3 block(1, block_size, 1); + const int maxGridSizeX = m_policy.space().cuda_device_prop().maxGridSize[0]; dim3 grid( - std::min( - typename Policy::index_type((nwork + block.y - 1) / block.y), - typename Policy::index_type(cuda_internal_maximum_grid_count()[0])), + std::min(typename Policy::index_type((nwork + block.y - 1) / block.y), + typename Policy::index_type(maxGridSizeX)), 1, 1); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (Kokkos::Impl::CudaInternal::cuda_use_serial_execution()) { @@ -244,10 +243,10 @@ class ParallelReduce, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); } else if (word_count.value > 1) { - // Inside cuda_single_inter_block_reduce_scan() above, shared[i] below - // might have been updated by a single thread within a warp without - // synchronization afterwards. Synchronize threads within warp to avoid - // potential racecondition. + // Inside cuda_single_inter_block_reduce_scan() and final() above, + // shared[i] below might have been updated by a single thread within a + // warp without synchronization afterwards. Synchronize threads within + // warp to avoid potential race condition. 
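Several local_block_size() implementations touched in these hunks share one search loop: start from a candidate thread count and halve it until the inter-block reduce/scan's shared-memory footprint fits the device limit. A distilled sketch, with shmem_needed standing in for cuda_single_inter_block_reduce_scan_shmem and the simpler of the two loop conditions shown (the reduce variants also bound n by the maximum block size):

```cpp
// Returns the largest halving of n whose shared-memory requirement fits;
// 0 signals that no feasible block size exists.
unsigned pick_block_size(unsigned n, int max_shmem_per_block,
                         unsigned (*shmem_needed)(unsigned)) {
  while (n && max_shmem_per_block < static_cast<int>(shmem_needed(n))) {
    n >>= 1;  // halve the candidate block size and retry
  }
  return n;
}
```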
__syncwarp(0xffffffff); } @@ -260,19 +259,18 @@ class ParallelReduce, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { unsigned n = CudaTraits::WarpSize * 8; + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; int shmem_size = cuda_single_inter_block_reduce_scan_shmem( f, n); using closure_type = Impl::ParallelReduce, Policy, Kokkos::Cuda>; - cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(m_policy.space().cuda_device()); while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || + (n && (maxShmemPerBlock < shmem_size)) || (n > static_cast( Kokkos::Impl::cuda_get_max_block_size( @@ -615,11 +613,11 @@ class ParallelScan, Kokkos::Cuda> { // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; @@ -939,11 +937,11 @@ class ParallelScanWithTotal, // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit // testing + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; unsigned n = CudaTraits::WarpSize * 4; while (n && - unsigned(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < + unsigned(maxShmemPerBlock) < cuda_single_inter_block_reduce_scan_shmem(f, n)) { n >>= 1; diff --git a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index b4679b4e0da..9f7be45c839 100644 --- a/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include @@ -98,7 +98,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); int block_size = Kokkos::Impl::cuda_get_max_block_size( @@ -137,7 +137,7 @@ class TeamPolicyInternal Impl::ParallelFor>; cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = Kokkos::Impl::cuda_get_opt_block_size( @@ -262,7 +262,8 @@ class TeamPolicyInternal m_tune_team(bool(team_size_request <= 0)), m_tune_vector(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= int(Impl::cuda_internal_maximum_grid_count()[0])) + const int maxGridSizeX = m_space.cuda_device_prop().maxGridSize[0]; + if (league_size_ >= maxGridSizeX) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution " "space."); @@ -369,7 +370,7 @@ class TeamPolicyInternal cudaFuncAttributes attr = CudaParallelLaunch:: - get_cuda_func_attributes(); + get_cuda_func_attributes(space().cuda_device()); const int block_size = std::forward(block_size_callable)( space().impl_internal_space_instance(), attr, f, (size_t)impl_vector_length(), @@ -539,8 +540,8 @@ class ParallelFor, auto internal_space_instance = m_policy.space().impl_internal_space_instance(); cudaFuncAttributes attr = - CudaParallelLaunch::get_cuda_func_attributes(); + 
CudaParallelLaunch::get_cuda_func_attributes( + internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? m_team_size @@ -575,10 +576,11 @@ class ParallelFor, static_cast(m_league_size)))); } + const int maxShmemPerBlock = + m_policy.space().cuda_device_prop().sharedMemPerBlock; const int shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - printf("%i %i\n", internal_space_instance->m_maxShmemPerBlock, - shmem_size_total); + if (maxShmemPerBlock < shmem_size_total) { + printf("%i %i\n", maxShmemPerBlock, shmem_size_total); Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); } @@ -623,6 +625,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::Cuda::size_type), + std::conditional_t, + Kokkos::Cuda::size_type>; using size_type = Cuda::size_type; using reducer_type = ReducerType; @@ -646,9 +664,11 @@ class ParallelReduce + const integral_nonzero_constant word_count(m_functor_reducer.get_reducer().value_size() / - sizeof(size_type)); + sizeof(word_size_type)); reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + + kokkos_impl_cuda_shared_memory() + threadIdx.y * word_count.value); // Iterate this block through the league @@ -721,18 +742,19 @@ class ParallelReduce( m_functor_reducer.get_reducer(), blockIdx.x, gridDim.x, - kokkos_impl_cuda_shared_memory(), m_scratch_space, + kokkos_impl_cuda_shared_memory(), m_scratch_space, m_scratch_flags); if (do_final_reduction) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_cuda_shared_memory() + - (blockDim.y - 1) * word_count.value; + word_size_type* const shared = + kokkos_impl_cuda_shared_memory() + + (blockDim.y - 1) * word_count.value; size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) + ? reinterpret_cast(m_result_ptr) : (m_unified_space ? 
m_unified_space : m_scratch_space); if (threadIdx.y == 0) { @@ -787,7 +809,8 @@ class ParallelReduce(m_scratch_space), result, + m_scratch_flags, blockDim.y)) { const unsigned id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { m_functor_reducer.get_reducer().final(&value); @@ -808,13 +831,15 @@ class ParallelReduce(cuda_internal_scratch_space( + m_policy.space(), + m_functor_reducer.get_reducer().value_size() * block_count)); m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), m_functor_reducer.get_reducer().value_size()); + m_unified_space = + reinterpret_cast(cuda_internal_scratch_unified( + m_policy.space(), m_functor_reducer.get_reducer().value_size())); dim3 block(m_vector_size, m_team_size, 1); dim3 grid(block_count, 1, 1); @@ -847,7 +872,8 @@ class ParallelReduce(m_result_ptr, m_scratch_space, size); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); } } } @@ -883,9 +909,8 @@ class ParallelReduce::get_cuda_func_attributes(); + cudaFuncAttributes attr = CudaParallelLaunch:: + get_cuda_func_attributes(internal_space_instance->m_cudaDev); m_team_size = m_team_size >= 0 ? m_team_size @@ -940,6 +965,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (maxShmemPerBlock < shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much " "L0 scratch memory")); diff --git a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 7ccedbfe28d..3037c4ab541 100644 --- a/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -103,7 +103,7 @@ template __device__ bool cuda_inter_block_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, const FunctorType& reducer, - Cuda::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, Cuda::size_type* const m_scratch_flags, const int max_active_thread = blockDim.y) { @@ -117,7 +117,7 @@ __device__ bool cuda_inter_block_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = ((pointer_type)m_scratch_space) + blockIdx.x; + pointer_type global = m_scratch_space + blockIdx.x; *global = value; } @@ -140,7 +140,7 @@ __device__ bool cuda_inter_block_reduction( last_block = true; value = neutral; - pointer_type const volatile global = (pointer_type)m_scratch_space; + pointer_type const volatile global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = @@ -702,8 +702,7 @@ inline void check_reduced_view_shmem_size(const Policy& policy, unsigned reqShmemSize = cuda_single_inter_block_reduce_scan_shmem( functor, minBlockSize); - size_t maxShmemPerBlock = - policy.space().impl_internal_space_instance()->m_maxShmemPerBlock; + size_t maxShmemPerBlock = policy.space().cuda_device_prop().sharedMemPerBlock; if (reqShmemSize > maxShmemPerBlock) { Kokkos::Impl::throw_runtime_exception( diff --git a/core/src/Cuda/Kokkos_Cuda_Task.hpp b/core/src/Cuda/Kokkos_Cuda_Task.hpp index baff7ef3f55..86d6d91bbee 100644 --- a/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -84,8 +84,8 @@ class TaskQueueSpecialization> { KOKKOS_INLINE_FUNCTION static void 
iff_single_thread_recursive_execute(scheduler_type const&) {} - static int get_max_team_count(execution_space const&) { - return Kokkos::Impl::cuda_internal_multiprocessor_count() * warps_per_block; + static int get_max_team_count(execution_space const& space) { + return space.cuda_device_prop().multiProcessorCount * warps_per_block; } __device__ static void driver(scheduler_type scheduler, @@ -225,7 +225,11 @@ class TaskQueueSpecialization> { // FIXME_CUDA_MULTIPLE_DEVICES static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda& exec = scheduler.get_execution_space(); + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + exec.cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; const cudaStream_t stream = nullptr; @@ -245,34 +249,30 @@ class TaskQueueSpecialization> { // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 1 << 11; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization::execute: Post Task Execution"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -300,8 +300,8 @@ class TaskQueueSpecialization> { set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecialization::execute: Post Get Function Pointer for Tasks"); @@ -466,7 +466,13 @@ class TaskQueueSpecializationConstrained< static void execute(scheduler_type const& scheduler) { const int shared_per_warp = 2048; const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const Kokkos::Cuda exec = Cuda(); // FIXME_CUDA_MULTIPLE_DEVICES + const auto& impl_instance = exec.impl_internal_space_instance(); + const int multi_processor_count = + // FIXME not sure why this didn't work + // exec.cuda_device_prop().multiProcessorCount; + impl_instance->m_deviceProp.multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); // const dim3 grid( 1 , 1 , 1 ); const dim3 block(1, 
Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared_total = shared_per_warp * warps_per_block; @@ -482,34 +488,30 @@ class TaskQueueSpecializationConstrained< // Query the stack size, in bytes: size_t previous_stack_size = 0; - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_get_limit_wrapper( - &previous_stack_size, cudaLimitStackSize))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_get_limit_wrapper( + &previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: const size_t larger_stack_size = 2048; if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, larger_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: Post Execute Task"); if (previous_stack_size < larger_stack_size) { - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_device_set_limit_wrapper( - cudaLimitStackSize, previous_stack_size))); + KOKKOS_IMPL_CUDA_SAFE_CALL(impl_instance->cuda_device_set_limit_wrapper( + cudaLimitStackSize, previous_stack_size)); } } @@ -532,8 +534,7 @@ class TaskQueueSpecializationConstrained< set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_get_last_error_wrapper())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); Impl::cuda_device_synchronize( "Kokkos::Impl::TaskQueueSpecializationConstrained::get_function_pointer: Post Get Function Pointer"); diff --git a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index abb747e39a1..94a428493f4 100644 --- a/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ b/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -22,7 +22,6 @@ #include #include -#include namespace Kokkos { diff --git a/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index a945a716bc3..c7ea6988a5d 100644 --- a/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -77,7 +77,9 @@ class ParallelFor, inline void execute() { const int warps_per_block = 4; - const dim3 grid(Kokkos::Impl::cuda_internal_multiprocessor_count(), 1, 1); + const int multi_processor_count = + m_policy.space().cuda_device_prop().multiProcessorCount; + const dim3 grid(multi_processor_count, 1, 1); const dim3 block(1, Kokkos::Impl::CudaTraits::WarpSize, warps_per_block); const int shared = 0; diff --git a/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index c7f0d12d914..517c592af72 100644 --- a/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -25,23 +25,14 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, + const View& dst) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() ->cuda_memset_async_wrapper( dst.data(), 0, dst.size() * sizeof(typename View::value_type)))); } - - 
ZeroMemset(const View& dst, - typename View::const_value_type&) { - // FIXME_CUDA_MULTIPLE_DEVICES - KOKKOS_IMPL_CUDA_SAFE_CALL( - (Kokkos::Impl::CudaInternal::singleton().cuda_memset_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); - } }; } // namespace Impl diff --git a/core/src/HIP/Kokkos_HIP.cpp b/core/src/HIP/Kokkos_HIP.cpp index f78bfd28b2f..309e07fb3fb 100644 --- a/core/src/HIP/Kokkos_HIP.cpp +++ b/core/src/HIP/Kokkos_HIP.cpp @@ -18,6 +18,7 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif +#include #include #include @@ -41,7 +42,9 @@ int HIP::impl_is_initialized() { } void HIP::impl_initialize(InitializationSettings const& settings) { - const int hip_device_id = Impl::get_gpu(settings); + const std::vector& visible_devices = Impl::get_visible_devices(); + const int hip_device_id = + Impl::get_gpu(settings).value_or(visible_devices[0]); Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( @@ -89,10 +92,23 @@ void HIP::impl_initialize(InitializationSettings const& settings) { hipStream_t singleton_stream; KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&singleton_stream)); - Impl::HIPInternal::singleton().initialize(singleton_stream, /*manage*/ true); + Impl::HIPInternal::singleton().initialize(singleton_stream); } -void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); } +void HIP::impl_finalize() { + (void)Impl::hip_global_unique_token_locks(true); + + desul::Impl::finalize_lock_arrays(); // FIXME + + KOKKOS_IMPL_HIP_SAFE_CALL( + hipEventDestroy(Impl::HIPInternal::constantMemReusable)); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipHostFree(Impl::HIPInternal::constantMemHostStaging)); + + Impl::HIPInternal::singleton().finalize(); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipStreamDestroy(Impl::HIPInternal::singleton().m_stream)); +} HIP::HIP() : m_space_instance(&Impl::HIPInternal::singleton(), @@ -102,13 +118,17 @@ HIP::HIP() } HIP::HIP(hipStream_t const stream, Impl::ManageStream manage_stream) - : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + new Impl::HIPInternal, [manage_stream](Impl::HIPInternal* ptr) { + ptr->finalize(); + if (static_cast(manage_stream)) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(ptr->m_stream)); + } + delete ptr; + }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); - m_space_instance->initialize(stream, static_cast(manage_stream)); + m_space_instance->initialize(stream); } KOKKOS_DEPRECATED HIP::HIP(hipStream_t const stream, bool manage_stream) diff --git a/core/src/HIP/Kokkos_HIP.hpp b/core/src/HIP/Kokkos_HIP.hpp index 61ed346b218..3a88e97ee3d 100644 --- a/core/src/HIP/Kokkos_HIP.hpp +++ b/core/src/HIP/Kokkos_HIP.hpp @@ -57,13 +57,15 @@ class HIP { //! \name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__HIP_DEVICE_COMPILE__) return true; #else return false; #endif } +#endif /** \brief Wait until all dispatched functors complete. 
* @@ -94,9 +96,13 @@ class HIP { static int impl_is_initialized(); - // static size_type device_arch(); - - static size_type detect_device_count(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static size_type detect_device_count() { + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); diff --git a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 576c53426bc..5f0df72df17 100644 --- a/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -20,13 +20,11 @@ #include #include -#include #include #include #include -#include #include namespace Kokkos { @@ -43,7 +41,6 @@ class GraphNodeKernelImpl using base_t = typename PatternImplSpecializationFromTag::type; - using Record = Kokkos::Impl::SharedAllocationRecord; // TODO use the name and executionspace template @@ -60,7 +57,7 @@ class GraphNodeKernelImpl ~GraphNodeKernelImpl() { if (m_driver_storage) { - Record::decrement(Record::get_record(m_driver_storage)); + Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); } } @@ -78,15 +75,9 @@ class GraphNodeKernelImpl Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - - auto* record = Record::allocate( - Kokkos::HIPSpace{}, "GraphNodeKernel global memory functor storage", - sizeof(base_t)); - - Record::increment(record); - m_driver_storage = reinterpret_cast(record->data()); + m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( + "GraphNodeKernel global memory functor storage", sizeof(base_t))); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; } diff --git a/core/src/HIP/Kokkos_HIP_Instance.cpp b/core/src/HIP/Kokkos_HIP_Instance.cpp index 7f04eb721cb..22c0db047f6 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include /*--------------------------------------------------------------------------*/ @@ -89,10 +90,14 @@ void HIPInternal::print_configuration(std::ostream &s) const { << '\n'; #endif - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); + s << "macro KOKKOS_ENABLE_ROCTHRUST : " +#if defined(KOKKOS_ENABLE_ROCTHRUST) + << "defined\n"; +#else + << "undefined\n"; +#endif - for (int i = 0; i < hipDevCount; ++i) { + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); std::string gpu_type = hipProp.integrated == 1 ? 
"APU" : "dGPU"; @@ -159,14 +164,13 @@ void HIPInternal::fence(const std::string &name) const { [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); }); } -void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { +void HIPInternal::initialize(hipStream_t stream) { KOKKOS_EXPECTS(!is_initialized()); if (was_finalized) Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); - m_stream = stream; - m_manage_stream = manage_stream; + m_stream = stream; //---------------------------------- // Multiblock reduction uses scratch flags for counters @@ -192,20 +196,19 @@ void HIPInternal::initialize(hipStream_t stream, bool manage_stream) { Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord; + if (m_scratchSpace) { + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + } - if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchSpaceCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchSpace", alloc_size); - - Record::increment(r); - - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -214,21 +217,23 @@ Kokkos::HIP::size_type *HIPInternal::scratch_space(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); + Kokkos::HIPSpace mem_space; - using Record = Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) { + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + } - if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = multiply_overflow_abort(m_scratchFlagsCount, sizeScratchGrain); - Record *const r = Record::allocate( - Kokkos::HIPSpace(), "Kokkos::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::InternalScratchFlags", alloc_size)); + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. 
KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchFlags, 0, alloc_size)); } @@ -238,29 +243,20 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) { Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution( void const *driver, std::size_t const size) const { if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) { - m_scratchFunctorSize = size; - - using Record = Kokkos::Impl::SharedAllocationRecord; - using RecordHost = - Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; + Kokkos::HIPHostPinnedSpace host_mem_space; if (m_scratchFunctor) { - Record::decrement(Record::get_record(m_scratchFunctor)); - RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } - Record *const r = - Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor", - m_scratchFunctorSize); - RecordHost *const r_host = RecordHost::allocate( - Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost", - m_scratchFunctorSize); - - Record::increment(r); - RecordHost::increment(r_host); + m_scratchFunctorSize = size; - m_scratchFunctor = reinterpret_cast(r->data()); - m_scratchFunctorHost = reinterpret_cast(r_host->data()); + m_scratchFunctor = static_cast(device_mem_space.allocate( + "Kokkos::InternalScratchFunctor", m_scratchFunctorSize)); + m_scratchFunctorHost = static_cast(host_mem_space.allocate( + "Kokkos::InternalScratchFunctorHost", m_scratchFunctorSize)); } // When using HSA_XNACK=1, it is necessary to copy the driver to the host to @@ -323,23 +319,18 @@ void HIPInternal::finalize() { this->fence("Kokkos::HIPInternal::finalize: fence on finalization"); was_finalized = true; - if (this == &singleton()) { - (void)Kokkos::Impl::hip_global_unique_token_locks(true); - desul::Impl::finalize_lock_arrays(); // FIXME - - KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); - KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); - } - if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - using RecordHIP = Kokkos::Impl::SharedAllocationRecord; + Kokkos::HIPSpace device_mem_space; - RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchSpaceCount * sizeScratchGrain); + device_mem_space.deallocate(m_scratchSpace, + m_scratchFlagsCount * sizeScratchGrain); if (m_scratchFunctorSize > 0) { - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor)); - RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost)); + device_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + Kokkos::HIPHostPinnedSpace host_mem_space; + host_mem_space.deallocate(m_scratchFunctorHost, m_scratchFunctorSize); } } @@ -348,14 +339,10 @@ void HIPInternal::finalize() { Kokkos::kokkos_free(m_team_scratch_ptr[i]); } - if (m_manage_stream && m_stream != nullptr) - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream)); - m_scratchSpaceCount = 0; m_scratchFlagsCount = 0; m_scratchSpace = nullptr; m_scratchFlags = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { m_team_scratch_current_size[i] = 0; m_team_scratch_ptr[i] = nullptr; @@ -419,13 +406,3 @@ void Kokkos::Impl::create_HIP_instances(std::vector &instances) { instances[s] = HIP(stream, ManageStream::yes); } } - 
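// ---------------------------------------------------------------------------
// Aside: with SharedAllocationRecord gone, HIPInternal has to uphold two
// invariants by hand. (1) scratch_flags is zero-filled exactly once, at
// (re)allocation; parallel_reduce / parallel_scan must return it to the
// all-zero state after use. (2) Every raw HIPSpace::allocate() must be
// matched in finalize() by a deallocate() carrying that same buffer's byte
// count -- the flags buffer with the flags count, the space buffer with the
// space count; the two counters must not be mixed up. A sketch of the
// pairing, using only the allocate/deallocate calls shown in the hunk above
// (release_scratch is a hypothetical helper):
#include <Kokkos_Core.hpp>
#include <cstddef>

inline void release_scratch(void* flags, std::size_t flags_bytes, void* space,
                            std::size_t space_bytes) {
  Kokkos::HIPSpace mem;
  // Each buffer is freed with the byte count it was allocated with.
  if (flags) mem.deallocate(flags, flags_bytes);
  if (space) mem.deallocate(space, space_bytes);
}
// ---------------------------------------------------------------------------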
-//---------------------------------------------------------------------------- - -namespace Kokkos { -HIP::size_type HIP::detect_device_count() { - int hipDevCount; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&hipDevCount)); - return hipDevCount; -} -} // namespace Kokkos diff --git a/core/src/HIP/Kokkos_HIP_Instance.hpp b/core/src/HIP/Kokkos_HIP_Instance.hpp index 63ad66686bb..142008124af 100644 --- a/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -98,7 +98,6 @@ class HIPInternal { uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); - bool m_manage_stream = false; // Team Scratch Level 1 Space int m_n_team_scratch = 10; @@ -124,7 +123,7 @@ class HIPInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(hipStream_t stream, bool manage_stream); + void initialize(hipStream_t stream); void finalize(); void print_configuration(std::ostream &) const; diff --git a/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp b/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp new file mode 100644 index 00000000000..db07c360b5c --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp @@ -0,0 +1,173 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_MDRANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +// ParallelFor +template +class ParallelFor, HIP> { + public: + using Policy = Kokkos::MDRangePolicy; + using functor_type = FunctorType; + + private: + using array_index_type = typename Policy::array_index_type; + using index_type = typename Policy::index_type; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + Kokkos::Impl::DeviceIterateTile(m_policy, + m_functor) + .exec_range(); + } + + inline void execute() const { + using ClosureType = ParallelFor; + if (m_policy.m_num_tiles == 0) return; + auto const maxblocks = hip_internal_maximum_grid_count(); + if (Policy::rank == 2) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + 1); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 3) { + dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], + m_policy.m_tile[2]); + dim3 const grid( + std::min( + (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / + block.x, + maxblocks[0]), + std::min( + (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + 
(m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 4) { + // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to + // threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2], m_policy.m_tile[3]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / + block.y, + maxblocks[1]), + std::min( + (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 5) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 + // to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / + block.z, + maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else if (Policy::rank == 6) { + // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; + // id4,id5 to threadIdx.z + dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], + m_policy.m_tile[2] * m_policy.m_tile[3], + m_policy.m_tile[4] * m_policy.m_tile[5]); + dim3 const grid( + std::min( + m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), + std::min( + m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), + std::min( + m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); + hip_parallel_launch( + *this, grid, block, 0, + m_policy.space().impl_internal_space_instance(), false); + } else { + Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); + } + + } // end execute + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} + + template + static int max_tile_size_product(const Policy&, const Functor&) { + using closure_type = + ParallelFor, HIP>; + unsigned block_size = hip_get_max_blocksize(); + if (block_size == 0) + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "tile size.")); + return block_size; + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp new file mode 100644 index 00000000000..9355c1c75fb --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -0,0 +1,100 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
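// ---------------------------------------------------------------------------
// Aside on the new Kokkos_HIP_ParallelFor_MDRange.hpp above: every
// rank-specific branch of execute() builds its grid from the same clamped
// ceiling division, grid_i = min(ceil(extent_i / tile_i), maxblocks[i]).
// Factored out as a sketch (clamped_ceil_div is a hypothetical helper, not
// part of the patch):
#include <algorithm>

inline unsigned clamped_ceil_div(long long extent, unsigned tile,
                                 unsigned max_blocks) {
  // ceil(extent / tile), then clamp to the device's maximum grid dimension.
  const unsigned blocks = static_cast<unsigned>((extent + tile - 1) / tile);
  return std::min(blocks, max_blocks);
}
// e.g. the rank-2 case:
//   dim3 grid(clamped_ceil_div(upper[0] - lower[0], block.x, maxblocks[0]),
//             clamped_ceil_div(upper[1] - lower[1], block.y, maxblocks[1]), 1);
// ---------------------------------------------------------------------------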
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_FOR_RANGE_HPP + +#include + +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + + private: + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + + const FunctorType m_functor; + const Policy m_policy; + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(i); + } + + template + inline __device__ std::enable_if_t::value> exec_range( + const Member i) const { + m_functor(TagType(), i); + } + + public: + using functor_type = FunctorType; + + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + inline __device__ void operator()() const { + const Member work_stride = blockDim.y * gridDim.x; + const Member work_end = m_policy.end(); + + for (Member iwork = + m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; + iwork < work_end; + iwork = iwork < work_end - work_stride ? iwork + work_stride + : work_end) { + this->template exec_range(iwork); + } + } + + inline void execute() const { + const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + + using DriverType = ParallelFor; + const int block_size = + Kokkos::Impl::hip_get_preferred_blocksize(); + const dim3 block(1, block_size, 1); + const dim3 grid( + typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " + "valid execution configuration.")); + } + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), + false); + } + + ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) + : m_functor(arg_functor), m_policy(arg_policy) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp new file mode 100644 index 00000000000..bf0c2193383 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -0,0 +1,177 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
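// ---------------------------------------------------------------------------
// Aside on Kokkos_HIP_ParallelFor_Range.hpp above: operator() is a classic
// grid-stride loop, except that the increment is written as
//   iwork = (iwork < work_end - work_stride) ? iwork + work_stride : work_end
// so the index steps directly to work_end instead of evaluating an
// iwork + work_stride that could wrap for unsigned index types. Host-side
// sketch of the same traversal (strided_visit is a hypothetical name):
#include <cstdint>

template <class F>
void strided_visit(std::uint32_t begin, std::uint32_t end, std::uint32_t first,
                   std::uint32_t stride, F&& f) {
  for (std::uint32_t i = begin + first; i < end;
       i = (i < end - stride) ? i + stride : end) {
    f(i);  // visit every stride-th index owned by this "thread"
  }
}
// ---------------------------------------------------------------------------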
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_FOR_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelFor, HIP> { + public: + using Policy = TeamPolicy; + using functor_type = FunctorType; + using size_type = HIP::size_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ team reduce space ] + // [ team shared space ] + + FunctorType const m_functor; + Policy const m_policy; + size_type const m_league_size; + int m_team_size; + size_type const m_vector_size; + int m_shmem_begin; + int m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(member); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + const member_type& member) const { + m_functor(TagType(), member); + } + + public: + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; + ParallelFor& operator=(ParallelFor const&) = delete; + + __device__ inline void operator()() const { + // Iterate this block through the league + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team(typename Policy::member_type( + kokkos_impl_hip_shared_memory(), m_shmem_begin, m_shmem_size, + static_cast(static_cast(m_scratch_ptr[1]) + + ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size)); + } + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + inline void execute() const { + int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; + dim3 const grid(static_cast(m_league_size), 1, 1); + dim3 const block(static_cast(m_vector_size), + static_cast(m_team_size), 1); + + using closure_type = + ParallelFor, HIP>; + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + } + + ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) + : m_functor(arg_functor), + m_policy(arg_policy), + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? 
m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); + + m_shmem_begin = (sizeof(double) * (m_team_size + 2)); + m_shmem_size = + (m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value(m_functor, m_team_size)); + m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + m_scratch_ptr[0] = nullptr; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); + } + + size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); + } + } + + ~ParallelFor() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp similarity index 61% rename from core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp rename to core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp index 0fa325cb12c..55b6218d1c8 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_MDRange.hpp @@ -14,157 +14,19 @@ // //@HEADER -#ifndef KOKKOS_HIP_PARALLEL_MDRANGE_HPP -#define KOKKOS_HIP_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_MDRANGE_HPP + +#include #include #include #include #include #include -#include namespace Kokkos { namespace Impl { -// ParallelFor -template -class ParallelFor, HIP> { - public: - using Policy = Kokkos::MDRangePolicy; - using functor_type = FunctorType; - - private: - using array_index_type = typename Policy::array_index_type; - using index_type = typename Policy::index_type; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - Kokkos::Impl::DeviceIterateTile(m_policy, - m_functor) - .exec_range(); - } - - inline void execute() const { - using ClosureType = ParallelFor; - if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); - if (Policy::rank == 2) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - 
(m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - 1); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 3) { - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], - m_policy.m_tile[2]); - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - maxblocks[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2], m_policy.m_tile[3]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / - block.y, - maxblocks[1]), - std::min( - (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 - // to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / - block.z, - maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; - // id4,id5 to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4] * m_policy.m_tile[5]); - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], maxblocks[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], maxblocks[1]), - std::min( - m_policy.m_tile_end[4] * m_policy.m_tile_end[5], maxblocks[2])); - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); - } - - } // end execute - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - - template - static int max_tile_size_product(const Policy&, const Functor&) { - using closure_type = - ParallelFor, HIP>; - unsigned block_size = hip_get_max_blocksize(); - if (block_size == 0) - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " - "tile size.")); - return block_size; - } -}; // ParallelReduce template diff --git a/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp new file mode 100644 index 00000000000..c8981866e8a --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_Range.hpp @@ -0,0 +1,329 @@ +//@HEADER +// 
************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_RANGE_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, + Kokkos::HIP> { + public: + using Policy = Kokkos::RangePolicy; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using WorkRange = typename Policy::WorkRange; + using WorkTag = typename Policy::work_tag; + using Member = typename Policy::member_type; + using LaunchBounds = typename Policy::launch_bounds; + + public: + using pointer_type = typename ReducerType::pointer_type; + using value_type = typename ReducerType::value_type; + using reference_type = typename ReducerType::reference_type; + using functor_type = FunctorType; + using reducer_type = ReducerType; + using size_type = Kokkos::HIP::size_type; + using index_type = typename Policy::index_type; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::HIP::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the scan is performed. + // Within the scan, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the scan, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
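// ---------------------------------------------------------------------------
// Aside: the alias declared below can be exercised in isolation. A sketch of
// the same word-size selection, assuming size_type is a 32-bit unsigned and
// that the inner conditional picks int16_t for 2-byte scalars and int8_t
// otherwise (scan_word_t is a hypothetical stand-in):
#include <cstdint>
#include <type_traits>

template <class ValueType, class SizeType = std::uint32_t>
using scan_word_t =
    std::conditional_t<sizeof(ValueType) < sizeof(SizeType),
                       std::conditional_t<sizeof(ValueType) == 2, std::int16_t,
                                          std::int8_t>,
                       SizeType>;

// 1-byte scalars scan in int8_t words, 2-byte scalars in int16_t words,
// everything else in full 4-byte words:
static_assert(std::is_same_v<scan_word_t<std::int8_t>, std::int8_t>);
static_assert(std::is_same_v<scan_word_t<std::int16_t>, std::int16_t>);
static_assert(std::is_same_v<scan_word_t<double>, std::uint32_t>);
// ---------------------------------------------------------------------------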
+ using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(size_type), + std::conditional_t, size_type>; + + // Algorithmic constraints: blockSize is a power of two AND blockDim.y == + // blockDim.z == 1 + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + word_size_type* m_scratch_space = nullptr; + size_type* m_scratch_flags = nullptr; + + static constexpr bool UseShflReduction = false; + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Make the exec_range calls call to Reduce::DeviceIterateTile + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(i, update); + } + + template + __device__ inline std::enable_if_t::value> exec_range( + const Member& i, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), i, update); + } + + public: + __device__ inline void operator()() const { + using ReductionTag = std::conditional_t; + run(ReductionTag{}); + } + + __device__ inline void run(SHMEMReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + const integral_nonzero_constant + word_count(reducer.value_size() / sizeof(word_size_type)); + + { + reference_type value = reducer.init(reinterpret_cast( + ::Kokkos::kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); + + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of + // work to perform. Accumulate the values for this block. The accumulation + // ordering does not match the final pass, but is arithmetically + // equivalent. + + const WorkRange range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + } + + // Reduce with final value at blockDim.y - 1 location. + // Shortcut for length zero reduction + bool do_final_reduction = m_policy.begin() == m_policy.end(); + if (!do_final_reduction) + do_final_reduction = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + ::Kokkos::kokkos_impl_hip_shared_memory(), + m_scratch_space, m_scratch_flags); + if (do_final_reduction) { + // This is the final block with the final result at the final threads' + // location + + word_size_type* const shared = + ::Kokkos::kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + word_size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + // Number of blocks is bounded so that the reduction can be limited to two + // passes. Each thread block is given an approximately equal amount of work + // to perform. Accumulate the values for this block. 
The accumulation + // ordering does not match the final pass, but is arithmetically equivalent. + + WorkRange const range(m_policy, blockIdx.x, gridDim.x); + + for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); + iwork < iwork_end; iwork += blockDim.y) { + this->template exec_range(iwork, value); + } + + pointer_type const result = reinterpret_cast(m_scratch_space); + + int max_active_thread = static_cast(range.end() - range.begin()) < + static_cast(blockDim.y) + ? range.end() - range.begin() + : blockDim.y; + + max_active_thread = + (max_active_thread == 0) ? blockDim.y : max_active_thread; + + value_type init; + reducer.init(&init); + if (m_policy.begin() == m_policy.end()) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? m_result_ptr : result; + *final_result = value; + } else if (Impl::hip_inter_block_shuffle_reduction<>( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, max_active_thread)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + pointer_type const final_result = + m_result_ptr_device_accessible ? m_result_ptr : result; + *final_result = value; + } + } + } + + // Determine block size constrained by shared memory: + inline unsigned local_block_size(const FunctorType& f) { + const auto& instance = m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem(f, n); + }; + return Kokkos::Impl::hip_get_preferred_blocksize( + instance, shmem_functor); + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const index_type nwork = m_policy.end() - m_policy.begin(); + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + !std::is_same::value; + if ((nwork > 0) || need_device_set) { + const int block_size = local_block_size(m_functor_reducer.get_functor()); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid execution configuration.")); + } + + // REQUIRED ( 1 , N , 1 ) + dim3 block(1, block_size, 1); + // use a slightly less constrained, but still well bounded limit for + // scratch + int nblocks = (nwork + block.y - 1) / block.y; + // Heuristic deciding the value of nblocks. + // The general idea here is we want to: + // 1. Not undersubscribe the device (i.e., we want at least + // preferred_block_min blocks) + // 2. Have each thread reduce > 1 value to minimize overheads + // 3. 
Limit the total # of blocks, to avoid unbounded scratch space + constexpr int block_max = 4096; + constexpr int preferred_block_min = 1024; + + if (nblocks < preferred_block_min) { + // keep blocks as is, already have low parallelism + } else if (nblocks > block_max) { + // "large dispatch" -> already have lots of parallelism + nblocks = block_max; + } else { + // in the intermediate range, try to have each thread process multiple + // items to offset the cost of the reduction (with not enough + // parallelism to hide it) + int items_per_thread = + (nwork + nblocks * block_size - 1) / (nblocks * block_size); + if (items_per_thread < 4) { + int ratio = std::min( + (nblocks + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + nblocks /= ratio; + } + } + + // TODO: down casting these uses more space than required? + m_scratch_space = + (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * nblocks); + // Intentionally do not downcast to word_size_type since we use HIP + // atomics in Kokkos_HIP_ReduceScan.hpp + m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( + m_policy.space(), sizeof(size_type)); + // Required grid.x <= block.y + dim3 grid(nblocks, 1, 1); + + if (nwork == 0) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem = + UseShflReduction + ? 0 + : hip_single_inter_block_reduce_scan_shmem( + m_functor_reducer.get_functor(), block.y); + + Kokkos::Impl::hip_parallel_launch( + *this, grid, block, shmem, + m_policy.space().impl_internal_space_instance(), + false); // copy to device and execute + + if (!m_result_ptr_device_accessible && m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, + const Policy& arg_policy, const ViewType& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible) {} +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp new file mode 100644 index 00000000000..609ba28b866 --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -0,0 +1,394 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
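// ---------------------------------------------------------------------------
// Aside on the nblocks heuristic in Kokkos_HIP_ParallelReduce_Range.hpp
// above, restated as a standalone function: keep all blocks when parallelism
// is already low, cap at block_max for huge dispatches, and in the
// intermediate range shrink the grid until each thread reduces roughly four
// items. pick_nblocks is a hypothetical helper; the constants mirror the
// hunk above.
#include <algorithm>

inline int pick_nblocks(long long nwork, int block_size) {
  constexpr int block_max           = 4096;
  constexpr int preferred_block_min = 1024;
  int nblocks = static_cast<int>((nwork + block_size - 1) / block_size);
  if (nblocks < preferred_block_min) {
    // under-subscribed already: keep every block
  } else if (nblocks > block_max) {
    nblocks = block_max;  // large dispatch: plenty of parallelism
  } else {
    // intermediate range: amortize the reduction cost over >= 4 items/thread
    const long long per_grid = static_cast<long long>(nblocks) * block_size;
    const int items_per_thread =
        static_cast<int>((nwork + per_grid - 1) / per_grid);
    if (items_per_thread < 4) {
      const int ratio = std::min(
          (nblocks + preferred_block_min - 1) / preferred_block_min,
          (4 + items_per_thread - 1) / items_per_thread);
      nblocks /= ratio;
    }
  }
  return nblocks;
}
// e.g. pick_nblocks(1000000, 256) cuts ceil(1e6/256) = 3907 blocks down to
// 976, so each thread reduces about four elements.
// ---------------------------------------------------------------------------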
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP +#define KOKKOS_HIP_PARALLEL_REDUCE_TEAM_HPP + +#include + +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +template +class ParallelReduce, HIP> { + public: + using Policy = TeamPolicyInternal; + using FunctorType = typename CombinedFunctorReducerType::functor_type; + using ReducerType = typename CombinedFunctorReducerType::reducer_type; + + private: + using member_type = typename Policy::member_type; + using work_tag = typename Policy::work_tag; + using launch_bounds = typename Policy::launch_bounds; + + using pointer_type = typename ReducerType::pointer_type; + using reference_type = typename ReducerType::reference_type; + using value_type = typename ReducerType::value_type; + + public: + using functor_type = FunctorType; + using size_type = HIP::size_type; + + // static int constexpr UseShflReduction = false; + // FIXME_HIP This should be disabled unconditionally for best performance, but + // it currently causes tests to fail. + static constexpr int UseShflReduction = + (ReducerType::static_value_size() != 0); + + private: + struct ShflReductionTag {}; + struct SHMEMReductionTag {}; + + // Algorithmic constraints: blockDim.y is a power of two AND + // blockDim.y == blockDim.z == 1 shared memory utilization: + // + // [ global reduce space ] + // [ team reduce space ] + // [ team shared space ] + // + + const CombinedFunctorReducerType m_functor_reducer; + const Policy m_policy; + const pointer_type m_result_ptr; + const bool m_result_ptr_device_accessible; + const bool m_result_ptr_host_accessible; + size_type* m_scratch_space; + size_type* m_scratch_flags; + size_type m_team_begin; + size_type m_shmem_begin; + size_type m_shmem_size; + void* m_scratch_ptr[2]; + size_t m_scratch_size[2]; + int m_scratch_pool_id = -1; + int32_t* m_scratch_locks; + size_t m_num_scratch_locks; + const size_type m_league_size; + int m_team_size; + const size_type m_vector_size; + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(member, update); + } + + template + __device__ inline std::enable_if_t::value> exec_team( + member_type const& member, reference_type update) const { + m_functor_reducer.get_functor()(TagType(), member, update); + } + + __device__ inline void iterate_through_league(int const threadid, + reference_type value) const { + int const int_league_size = static_cast(m_league_size); + for (int league_rank = blockIdx.x; league_rank < int_league_size; + league_rank += gridDim.x) { + this->template exec_team( + member_type( + kokkos_impl_hip_shared_memory() + m_team_begin, + m_shmem_begin, m_shmem_size, + reinterpret_cast( + reinterpret_cast(m_scratch_ptr[1]) + + static_cast(threadid / (blockDim.x * blockDim.y)) * + m_scratch_size[1]), + m_scratch_size[1], league_rank, m_league_size), + value); + } + } + + int compute_block_count() const { + constexpr auto light_weight = + Kokkos::Experimental::WorkItemProperty::HintLightWeight; + constexpr typename Policy::work_item_property property; + // Numbers were tuned on MI210 using dot product and yAx benchmarks + constexpr int block_max = + (property & light_weight) == light_weight ? 
2097152 : 65536; + constexpr int preferred_block_min = 1024; + int block_count = m_league_size; + if (block_count < preferred_block_min) { + // keep blocks as is, already low parallelism + } else if (block_count >= block_max) { + block_count = block_max; + + } else { + int nwork = m_league_size * m_team_size; + int items_per_thread = + (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); + if (items_per_thread < 4) { + int ratio = std::min( + (block_count + preferred_block_min - 1) / preferred_block_min, + (4 + items_per_thread - 1) / items_per_thread); + block_count /= ratio; + } + } + + return block_count; + } + + public: + __device__ inline void operator()() const { + int64_t threadid = 0; + if (m_scratch_size[1] > 0) { + threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, + m_num_scratch_locks); + } + + using ReductionTag = std::conditional_t; + run(ReductionTag{}, threadid); + + if (m_scratch_size[1] > 0) { + hip_release_scratch_index(m_scratch_locks, threadid); + } + } + + __device__ inline void run(SHMEMReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(size_type)); + + reference_type value = + reducer.init(kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value); + // Iterate this block through the league + iterate_through_league(threadid, value); + + // Reduce with final value at blockDim.y - 1 location. + bool do_final_reduce = (m_league_size == 0); + if (!do_final_reduce) + do_final_reduce = + hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); + if (do_final_reduce) { + // This is the final block with the final result at the final threads' + // location + + size_type* const shared = kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; + + if (threadIdx.y == 0) { + reducer.final(reinterpret_cast(shared)); + } + + if (HIPTraits::WarpSize < word_count.value) { + __syncthreads(); + } + + for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { + global[i] = shared[i]; + } + } + } + + __device__ inline void run(ShflReductionTag, int const threadid) const { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + value_type value; + reducer.init(&value); + + // Iterate this block through the league + iterate_through_league(threadid, value); + + pointer_type const result = + m_result_ptr_device_accessible + ? 
m_result_ptr + : reinterpret_cast(m_scratch_space); + + value_type init; + reducer.init(&init); + if (m_league_size == 0) { + reducer.final(&value); + *result = value; + } else if (Impl::hip_inter_block_shuffle_reduction( + value, init, reducer, m_scratch_space, result, + m_scratch_flags, blockDim.y)) { + unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; + if (id == 0) { + reducer.final(&value); + *result = value; + } + } + } + + inline void execute() { + const ReducerType& reducer = m_functor_reducer.get_reducer(); + + const bool is_empty_range = m_league_size == 0 || m_team_size == 0; + const bool need_device_set = ReducerType::has_init_member_function() || + ReducerType::has_final_member_function() || + !m_result_ptr_host_accessible || + Policy::is_graph_kernel::value || + !std::is_same::value; + if (!is_empty_range || need_device_set) { + int const block_count = compute_block_count(); + + m_scratch_space = hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count); + m_scratch_flags = + hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); + + dim3 block(m_vector_size, m_team_size, 1); + dim3 grid(block_count, 1, 1); + if (is_empty_range) { + block = dim3(1, 1, 1); + grid = dim3(1, 1, 1); + } + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + Impl::hip_parallel_launch( + *this, grid, block, shmem_size_total, + m_policy.space().impl_internal_space_instance(), + true); // copy to device and execute + + if (!m_result_ptr_device_accessible) { + m_policy.space().impl_internal_space_instance()->fence(); + + if (m_result_ptr) { + const int size = reducer.value_size(); + DeepCopy(m_policy.space(), m_result_ptr, + m_scratch_space, size); + } + } + } else { + if (m_result_ptr) { + reducer.init(m_result_ptr); + } + } + } + + template + ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, + Policy const& arg_policy, ViewType const& arg_result) + : m_functor_reducer(arg_functor_reducer), + m_policy(arg_policy), + m_result_ptr(arg_result.data()), + m_result_ptr_device_accessible( + MemorySpaceAccess::accessible), + m_result_ptr_host_accessible( + MemorySpaceAccess::accessible), + m_scratch_space(nullptr), + m_scratch_flags(nullptr), + m_team_begin(0), + m_shmem_begin(0), + m_shmem_size(0), + m_scratch_ptr{nullptr, nullptr}, + m_league_size(arg_policy.league_size()), + m_team_size(arg_policy.team_size()), + m_vector_size(arg_policy.impl_vector_length()) { + auto internal_space_instance = + m_policy.space().impl_internal_space_instance(); + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), + arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + + m_team_begin = + UseShflReduction + ? 
0 + : hip_single_inter_block_reduce_scan_shmem( + arg_functor_reducer.get_functor(), m_team_size); + m_shmem_begin = sizeof(double) * (m_team_size + 2); + m_shmem_size = m_policy.scratch_size(0, m_team_size) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), m_team_size); + m_scratch_size[0] = m_shmem_size; + m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); + m_scratch_locks = internal_space_instance->m_scratch_locks; + m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); + m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( + m_scratch_pool_id, + static_cast(m_scratch_size[1]) * + (std::min( + static_cast(HIP().concurrency() / + (m_team_size * m_vector_size)), + static_cast(m_league_size)))); + } + + // The global parallel_reduce does not support vector_length other than 1 at + // the moment + if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " + "greater than 1 is not currently supported for HIP for dynamic " + "sized reduction types."); + + if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) + Impl::throw_runtime_exception( + "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " + "than 64 is not currently supported with HIP for dynamic sized " + "reduction types."); + + // Functor's reduce memory, team scan memory, and team shared memory depend + // upon team size. + + const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; + + if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && + !UseShflReduction) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); + } + + if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " + "L0 scratch memory")); + } + + size_t max_size = arg_policy.team_size_max( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " + "large team size.")); + } + } + + ~ParallelReduce() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp similarity index 50% rename from core/src/HIP/Kokkos_HIP_Parallel_Range.hpp rename to core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp index 26e8be4698a..41692a3291b 100644 --- a/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp @@ -14,390 +14,18 @@ // //@HEADER -#ifndef KOKKO_HIP_PARALLEL_RANGE_HPP -#define KOKKO_HIP_PARALLEL_RANGE_HPP +#ifndef KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_HIP_PARALLEL_SCAN_RANGE_HPP #include -#if defined(__HIPCC__) - #include #include #include -#include -#include namespace Kokkos { namespace Impl { -template -class ParallelFor, Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - - private: - using Member = typename Policy::member_type; - 
using WorkTag = typename Policy::work_tag; - using LaunchBounds = typename Policy::launch_bounds; - - const FunctorType m_functor; - const Policy m_policy; - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(i); - } - - template - inline __device__ std::enable_if_t::value> exec_range( - const Member i) const { - m_functor(TagType(), i); - } - - public: - using functor_type = FunctorType; - - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - inline __device__ void operator()() const { - const Member work_stride = blockDim.y * gridDim.x; - const Member work_end = m_policy.end(); - - for (Member iwork = - m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x; - iwork < work_end; - iwork = iwork < work_end - work_stride ? iwork + work_stride - : work_end) { - this->template exec_range(iwork); - } - } - - inline void execute() const { - const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); - - using DriverType = ParallelFor; - const int block_size = - Kokkos::Impl::hip_get_preferred_blocksize(); - const dim3 block(1, block_size, 1); - const dim3 grid( - typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " - "valid execution configuration.")); - } - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), - false); - } - - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, - Kokkos::HIP> { - public: - using Policy = Kokkos::RangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using WorkRange = typename Policy::WorkRange; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using LaunchBounds = typename Policy::launch_bounds; - - public: - using pointer_type = typename ReducerType::pointer_type; - using value_type = typename ReducerType::value_type; - using reference_type = typename ReducerType::reference_type; - using functor_type = FunctorType; - using reducer_type = ReducerType; - using size_type = Kokkos::HIP::size_type; - using index_type = typename Policy::index_type; - // Conditionally set word_size_type to int16_t or int8_t if value_type is - // smaller than int32_t (Kokkos::HIP::size_type) - // word_size_type is used to determine the word count, shared memory buffer - // size, and global memory buffer size before the scan is performed. - // Within the scan, the word count is recomputed based on word_size_type - // and when calculating indexes into the shared/global memory buffers for - // performing the scan, word_size_type is used again. - // For scalars > 4 bytes in size, indexing into shared/global memory relies - // on the block and grid dimensions to ensure that we index at the correct - // offset rather than at every 4 byte word; such that, when the join is - // performed, we have the correct data that was copied over in chunks of 4 - // bytes. 
- using word_size_type = std::conditional_t< - sizeof(value_type) < sizeof(size_type), - std::conditional_t, size_type>; - - // Algorithmic constraints: blockSize is a power of two AND blockDim.y == - // blockDim.z == 1 - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - word_size_type* m_scratch_space = nullptr; - size_type* m_scratch_flags = nullptr; - - static constexpr bool UseShflReduction = false; - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Make the exec_range calls call to Reduce::DeviceIterateTile - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(i, update); - } - - template - __device__ inline std::enable_if_t::value> exec_range( - const Member& i, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), i, update); - } - - public: - __device__ inline void operator()() const { - using ReductionTag = std::conditional_t; - run(ReductionTag{}); - } - - __device__ inline void run(SHMEMReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - const integral_nonzero_constant - word_count(reducer.value_size() / sizeof(word_size_type)); - - { - reference_type value = reducer.init(reinterpret_cast( - ::Kokkos::kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value)); - - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of - // work to perform. Accumulate the values for this block. The accumulation - // ordering does not match the final pass, but is arithmetically - // equivalent. - - const WorkRange range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - } - - // Reduce with final value at blockDim.y - 1 location. - // Shortcut for length zero reduction - bool do_final_reduction = m_policy.begin() == m_policy.end(); - if (!do_final_reduction) - do_final_reduction = hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - ::Kokkos::kokkos_impl_hip_shared_memory(), - m_scratch_space, m_scratch_flags); - if (do_final_reduction) { - // This is the final block with the final result at the final threads' - // location - - word_size_type* const shared = - ::Kokkos::kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - word_size_type* const global = - m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (::Kokkos::Impl::HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - // Number of blocks is bounded so that the reduction can be limited to two - // passes. Each thread block is given an approximately equal amount of work - // to perform. Accumulate the values for this block. 
The accumulation - // ordering does not match the final pass, but is arithmetically equivalent. - - WorkRange const range(m_policy, blockIdx.x, gridDim.x); - - for (Member iwork = range.begin() + threadIdx.y, iwork_end = range.end(); - iwork < iwork_end; iwork += blockDim.y) { - this->template exec_range(iwork, value); - } - - pointer_type const result = reinterpret_cast(m_scratch_space); - - int max_active_thread = static_cast(range.end() - range.begin()) < - static_cast(blockDim.y) - ? range.end() - range.begin() - : blockDim.y; - - max_active_thread = - (max_active_thread == 0) ? blockDim.y : max_active_thread; - - value_type init; - reducer.init(&init); - if (m_policy.begin() == m_policy.end()) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? m_result_ptr : result; - *final_result = value; - } else if (Impl::hip_inter_block_shuffle_reduction<>( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, max_active_thread)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - pointer_type const final_result = - m_result_ptr_device_accessible ? m_result_ptr : result; - *final_result = value; - } - } - } - - // Determine block size constrained by shared memory: - inline unsigned local_block_size(const FunctorType& f) { - const auto& instance = m_policy.space().impl_internal_space_instance(); - auto shmem_functor = [&f](unsigned n) { - return hip_single_inter_block_reduce_scan_shmem(f, n); - }; - return Kokkos::Impl::hip_get_preferred_blocksize( - instance, shmem_functor); - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const index_type nwork = m_policy.end() - m_policy.begin(); - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - !std::is_same::value; - if ((nwork > 0) || need_device_set) { - const int block_size = local_block_size(m_functor_reducer.get_functor()); - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " - "valid execution configuration.")); - } - - // REQUIRED ( 1 , N , 1 ) - dim3 block(1, block_size, 1); - // use a slightly less constrained, but still well bounded limit for - // scratch - int nblocks = (nwork + block.y - 1) / block.y; - // Heuristic deciding the value of nblocks. - // The general idea here is we want to: - // 1. Not undersubscribe the device (i.e., we want at least - // preferred_block_min blocks) - // 2. Have each thread reduce > 1 value to minimize overheads - // 3. 
Limit the total # of blocks, to avoid unbounded scratch space - constexpr int block_max = 4096; - constexpr int preferred_block_min = 1024; - - if (nblocks < preferred_block_min) { - // keep blocks as is, already have low parallelism - } else if (nblocks > block_max) { - // "large dispatch" -> already have lots of parallelism - nblocks = block_max; - } else { - // in the intermediate range, try to have each thread process multiple - // items to offset the cost of the reduction (with not enough - // parallelism to hide it) - int items_per_thread = - (nwork + nblocks * block_size - 1) / (nblocks * block_size); - if (items_per_thread < 4) { - int ratio = std::min( - (nblocks + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - nblocks /= ratio; - } - } - - // TODO: down casting these uses more space than required? - m_scratch_space = - (word_size_type*)::Kokkos::Impl::hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * nblocks); - // Intentionally do not downcast to word_size_type since we use HIP - // atomics in Kokkos_HIP_ReduceScan.hpp - m_scratch_flags = ::Kokkos::Impl::hip_internal_scratch_flags( - m_policy.space(), sizeof(size_type)); - // Required grid.x <= block.y - dim3 grid(nblocks, 1, 1); - - if (nwork == 0) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem = - UseShflReduction - ? 0 - : hip_single_inter_block_reduce_scan_shmem( - m_functor_reducer.get_functor(), block.y); - - Kokkos::Impl::hip_parallel_launch( - *this, grid, block, shmem, - m_policy.space().impl_internal_space_instance(), - false); // copy to device and execute - - if (!m_result_ptr_device_accessible && m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_policy.space(), m_result_ptr, - m_scratch_space, size); - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - const Policy& arg_policy, const ViewType& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible) {} -}; - template class ParallelScanHIPBase { public: @@ -763,5 +391,3 @@ class ParallelScanWithTotal, } // namespace Kokkos #endif - -#endif diff --git a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp deleted file mode 100644 index 3fe568ac361..00000000000 --- a/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ /dev/null @@ -1,936 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKO_HIP_PARALLEL_TEAM_HPP -#define KOKKO_HIP_PARALLEL_TEAM_HPP - -#include - -#if defined(__HIPCC__) - -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class TeamPolicyInternal - : public PolicyTraits { - public: - using execution_policy = TeamPolicyInternal; - - using traits = PolicyTraits; - - template - friend class TeamPolicyInternal; - - private: - typename traits::execution_space m_space; - int m_league_size; - int m_team_size; - int m_vector_length; - size_t m_team_scratch_size[2]; - size_t m_thread_scratch_size[2]; - int m_chunk_size; - bool m_tune_team_size; - bool m_tune_vector_length; - - public: - using execution_space = HIP; - - template - TeamPolicyInternal(TeamPolicyInternal const& p) { - m_league_size = p.m_league_size; - m_team_size = p.m_team_size; - m_vector_length = p.m_vector_length; - m_team_scratch_size[0] = p.m_team_scratch_size[0]; - m_team_scratch_size[1] = p.m_team_scratch_size[1]; - m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; - m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; - m_chunk_size = p.m_chunk_size; - m_space = p.m_space; - m_tune_team_size = p.m_tune_team_size; - m_tune_vector_length = p.m_tune_vector_length; - } - - template - int team_size_max(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common(f); - } - - template - inline int team_size_max(const FunctorType& f, - const ParallelReduceTag&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Max, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - inline int team_size_max(const FunctorType& f, const ReducerType&, - const ParallelReduceTag&) const { - using closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - template - int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { - using closure_type = - Impl::ParallelFor>; - - return internal_team_size_common( - f); - } - - template - inline int team_size_recommended(FunctorType const& f, - ParallelReduceTag const&) const { - using functor_analysis_type = - Impl::FunctorAnalysis; - using closure_type = Impl::ParallelReduce< - CombinedFunctorReducer, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common< - BlockType::Preferred, closure_type, - typename functor_analysis_type::value_type>(f); - } - - template - int team_size_recommended(FunctorType const& f, ReducerType const&, - ParallelReduceTag const&) const { - using closure_type = - Impl::ParallelReduce, - TeamPolicy, Kokkos::HIP>; - return internal_team_size_common(f); - } - - inline bool impl_auto_vector_length() const { return m_tune_vector_length; } - inline bool impl_auto_team_size() const { return m_tune_team_size; } - static int vector_length_max() { return HIPTraits::WarpSize; } - - static int verify_requested_vector_length(int requested_vector_length) { - int test_vector_length = - std::min(requested_vector_length, vector_length_max()); - - // Allow only power-of-two vector_length - if (!(is_integral_power_of_two(test_vector_length))) { - int test_pow2 = 1; - constexpr int warp_size = HIPTraits::WarpSize; - while (test_pow2 < warp_size) { - test_pow2 <<= 1; - if (test_pow2 > 
test_vector_length) { - break; - } - } - test_vector_length = test_pow2 >> 1; - } - - return test_vector_length; - } - - inline static int scratch_size_max(int level) { - // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team - // reductions. They also use one int64_t in static shared memory for a - // shared ID. Furthermore, they use additional scratch memory in some - // reduction scenarios, which depend on the size of the value_type and is - // NOT captured here - constexpr size_t max_possible_team_size = 1024; - constexpr size_t max_reserved_shared_mem_per_team = - (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); - // arbitrarily setting level 1 scratch limit to 20MB, for a - // MI250 that would give us about 4.4GB for 2 teams per CU - constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; - - size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; - return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team - : max_l1_scratch_size); - } - - inline void impl_set_vector_length(size_t size) { m_vector_length = size; } - inline void impl_set_team_size(size_t size) { m_team_size = size; } - int impl_vector_length() const { return m_vector_length; } - - int team_size() const { return m_team_size; } - - int league_size() const { return m_league_size; } - - size_t scratch_size(int level, int team_size_ = -1) const { - if (team_size_ < 0) team_size_ = m_team_size; - return m_team_scratch_size[level] + - team_size_ * m_thread_scratch_size[level]; - } - - size_t team_scratch_size(int level) const { - return m_team_scratch_size[level]; - } - - size_t thread_scratch_size(int level) const { - return m_thread_scratch_size[level]; - } - - typename traits::execution_space space() const { return m_space; } - - TeamPolicyInternal() - : m_space(typename traits::execution_space()), - m_league_size(0), - m_team_size(-1), - m_vector_length(0), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(false), - m_tune_vector_length(false) {} - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, int vector_length_request = 1) - : m_space(space_), - m_league_size(league_size_), - m_team_size(team_size_request), - m_vector_length( - (vector_length_request > 0) - ? verify_requested_vector_length(vector_length_request) - : (verify_requested_vector_length(1))), - m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_chunk_size(HIPTraits::WarpSize), - m_tune_team_size(bool(team_size_request <= 0)), - m_tune_vector_length(bool(vector_length_request <= 0)) { - // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) - Impl::throw_runtime_exception( - "Requested too large league_size for TeamPolicy on HIP execution " - "space."); - - // Make sure total block size is permissible - if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< HIP > the team size is too large. 
" - "Team size x vector length must be smaller than 1024.")); - } - } - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} - // FLAG - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - ) - : TeamPolicyInternal(space_, league_size_, team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(const execution_space space_, int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(space_, league_size_, -1, -1) - - {} - - TeamPolicyInternal(int league_size_, int team_size_request, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, vector_length_request) {} - - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - int vector_length_request = 1) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - vector_length_request) {} - - /** \brief Specify league size and team size, request vector length*/ - TeamPolicyInternal(int league_size_, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, - team_size_request, -1) - - {} - - /** \brief Specify league size, request team size and vector length*/ - TeamPolicyInternal(int league_size_, - const Kokkos::AUTO_t& /* team_size_request */, - const Kokkos::AUTO_t& /* vector_length_request */ - - ) - : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, - -1) {} - - int chunk_size() const { return m_chunk_size; } - - TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { - m_chunk_size = chunk_size_; - return *this; - } - - /** \brief set per team scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerTeamValue const& per_team) { - m_team_scratch_size[level] = per_team.value; - return *this; - } - - /** \brief set per thread scratch size for a specific level of the scratch - * hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, - PerThreadValue const& per_thread) { - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - /** \brief set per thread and per team scratch size for a specific level of - * the scratch hierarchy */ - TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, - PerThreadValue const& per_thread) { - m_team_scratch_size[level] = per_team.value; - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - using member_type = Kokkos::Impl::HIPTeamMember; - - protected: - template - int internal_team_size_common(FunctorType const& f) const { - const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); - unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); - using Tag = typename PatternTagFromImplSpecialization::type; - if constexpr (std::is_same_v) { - using Interface = - typename Impl::DeduceFunctorPatternInterface::type; - using Analysis = - Impl::FunctorAnalysis; - 
shmem_thread += - ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); - } - const int vector_length = impl_vector_length(); - - const auto functor = [&f, shmem_block, shmem_thread, vector_length]( - const hipFuncAttributes& attr, int block_size) { - int functor_shmem = - ::Kokkos::Impl::FunctorTeamShmemSize::value( - f, block_size / vector_length); - return shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - }; - int block_size; - if constexpr (BlockSize == BlockType::Max) { - block_size = hip_get_max_team_blocksize( - space().impl_internal_space_instance(), functor); - } else { - block_size = - hip_get_preferred_team_blocksize( - space().impl_internal_space_instance(), functor); - } - - if (block_size == 0) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " - "team size.")); - } - if constexpr (std::is_same_v) { - return block_size / impl_vector_length(); - } else { - // Currently we require Power-of-2 team size for reductions. - int p2 = 1; - while (p2 <= block_size) p2 *= 2; - p2 /= 2; - return p2 / impl_vector_length(); - } - } -}; - -__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, - int32_t* scratch_locks, - size_t num_scratch_locks) { - int64_t threadid = 0; - __shared__ int64_t base_thread_id; - if (threadIdx.x == 0 && threadIdx.y == 0) { - int64_t const wraparound_len = - Kokkos::min(int64_t(league_size), - int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); - threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; - threadid *= blockDim.x * blockDim.y; - int done = 0; - while (!done) { - done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); - if (!done) { - threadid += blockDim.x * blockDim.y; - if (int64_t(threadid + blockDim.x * blockDim.y) >= - wraparound_len * blockDim.x * blockDim.y) - threadid = 0; - } - } - base_thread_id = threadid; - } - __syncthreads(); - threadid = base_thread_id; - return threadid; -} - -__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, - int64_t threadid) { - __syncthreads(); - if (threadIdx.x == 0 && threadIdx.y == 0) { - scratch_locks[threadid] = 0; - } -} - -template -class ParallelFor, HIP> { - public: - using Policy = TeamPolicy; - using functor_type = FunctorType; - using size_type = HIP::size_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ team reduce space ] - // [ team shared space ] - - FunctorType const m_functor; - Policy const m_policy; - size_type const m_league_size; - int m_team_size; - size_type const m_vector_size; - int m_shmem_begin; - int m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - - template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(member); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - const member_type& member) const { - m_functor(TagType(), member); - } - - public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; - - __device__ inline void operator()() const { - // Iterate this block 
through the league - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team(typename Policy::member_type( - kokkos_impl_hip_shared_memory(), m_shmem_begin, m_shmem_size, - static_cast(static_cast(m_scratch_ptr[1]) + - ptrdiff_t(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size)); - } - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - inline void execute() const { - int64_t const shmem_size_total = m_shmem_begin + m_shmem_size; - dim3 const grid(static_cast(m_league_size), 1, 1); - dim3 const block(static_cast(m_vector_size), - static_cast(m_team_size), 1); - - using closure_type = - ParallelFor, HIP>; - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - } - - ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) - : m_functor(arg_functor), - m_policy(arg_policy), - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); - - m_shmem_begin = (sizeof(double) * (m_team_size + 2)); - m_shmem_size = - (m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value(m_functor, m_team_size)); - m_scratch_size[0] = m_policy.scratch_size(0, m_team_size); - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. 
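Before the scratch pointers are set up below, it is worth spelling out the sizing rule the constructor applies to level-1 scratch: allocate one slot per team that can actually be resident at the same time. A hedged sketch of that bound as a free function (hypothetical name, not the patch's API):

#include <algorithm>
#include <cstdint>

// Cap the level-1 scratch allocation at min(resident teams, league size):
// slots beyond what can ever run concurrently would never be used.
inline std::int64_t team_scratch_bytes_sketch(
    std::int64_t per_team_bytes, std::int64_t concurrency,
    std::int64_t team_size, std::int64_t vector_size,
    std::int64_t league_size) {
  std::int64_t const resident_teams = concurrency / (team_size * vector_size);
  return per_team_bytes * std::min(resident_teams, league_size);
}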
- m_scratch_ptr[0] = nullptr; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); - } - - size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); - } - } - - ~ParallelFor() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelReduce, HIP> { - public: - using Policy = TeamPolicyInternal; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - private: - using member_type = typename Policy::member_type; - using work_tag = typename Policy::work_tag; - using launch_bounds = typename Policy::launch_bounds; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - using value_type = typename ReducerType::value_type; - - public: - using functor_type = FunctorType; - using size_type = HIP::size_type; - - // static int constexpr UseShflReduction = false; - // FIXME_HIP This should be disabled unconditionally for best performance, but - // it currently causes tests to fail. 
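The constant defined next selects between the two reduction paths at compile time, and the class then tag-dispatches its run() overloads on it. The pattern in miniature (illustrative names only, not the real class):

#include <type_traits>

struct ShflTagSketch {};   // warp-shuffle path: statically sized value types
struct ShmemTagSketch {};  // shared-memory path: dynamically sized types

template <bool UseShfl>
struct ReduceDispatchSketch {
  using Tag = std::conditional_t<UseShfl, ShflTagSketch, ShmemTagSketch>;
  void run(ShflTagSketch) const {}
  void run(ShmemTagSketch) const {}
  void operator()() const { run(Tag{}); }  // overload resolved at compile time
};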
- static constexpr int UseShflReduction = - (ReducerType::static_value_size() != 0); - - private: - struct ShflReductionTag {}; - struct SHMEMReductionTag {}; - - // Algorithmic constraints: blockDim.y is a power of two AND - // blockDim.y == blockDim.z == 1 shared memory utilization: - // - // [ global reduce space ] - // [ team reduce space ] - // [ team shared space ] - // - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const bool m_result_ptr_device_accessible; - const bool m_result_ptr_host_accessible; - size_type* m_scratch_space; - size_type* m_scratch_flags; - size_type m_team_begin; - size_type m_shmem_begin; - size_type m_shmem_size; - void* m_scratch_ptr[2]; - size_t m_scratch_size[2]; - int m_scratch_pool_id = -1; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - const size_type m_league_size; - int m_team_size; - const size_type m_vector_size; - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& member, reference_type update) const { - m_functor_reducer.get_functor()(member, update); - } - - template - __device__ inline std::enable_if_t::value> exec_team( - member_type const& member, reference_type update) const { - m_functor_reducer.get_functor()(TagType(), member, update); - } - - __device__ inline void iterate_through_league(int const threadid, - reference_type value) const { - int const int_league_size = static_cast(m_league_size); - for (int league_rank = blockIdx.x; league_rank < int_league_size; - league_rank += gridDim.x) { - this->template exec_team( - member_type( - kokkos_impl_hip_shared_memory() + m_team_begin, - m_shmem_begin, m_shmem_size, - reinterpret_cast( - reinterpret_cast(m_scratch_ptr[1]) + - static_cast(threadid / (blockDim.x * blockDim.y)) * - m_scratch_size[1]), - m_scratch_size[1], league_rank, m_league_size), - value); - } - } - - int compute_block_count() const { - constexpr auto light_weight = - Kokkos::Experimental::WorkItemProperty::HintLightWeight; - constexpr typename Policy::work_item_property property; - // Numbers were tuned on MI210 using dot product and yAx benchmarks - constexpr int block_max = - (property & light_weight) == light_weight ? 
2097152 : 65536; - constexpr int preferred_block_min = 1024; - int block_count = m_league_size; - if (block_count < preferred_block_min) { - // keep blocks as is, already low parallelism - } else if (block_count >= block_max) { - block_count = block_max; - - } else { - int nwork = m_league_size * m_team_size; - int items_per_thread = - (nwork + block_count * m_team_size - 1) / (block_count * m_team_size); - if (items_per_thread < 4) { - int ratio = std::min( - (block_count + preferred_block_min - 1) / preferred_block_min, - (4 + items_per_thread - 1) / items_per_thread); - block_count /= ratio; - } - } - - return block_count; - } - - public: - __device__ inline void operator()() const { - int64_t threadid = 0; - if (m_scratch_size[1] > 0) { - threadid = hip_get_scratch_index(m_league_size, m_scratch_locks, - m_num_scratch_locks); - } - - using ReductionTag = std::conditional_t; - run(ReductionTag{}, threadid); - - if (m_scratch_size[1] > 0) { - hip_release_scratch_index(m_scratch_locks, threadid); - } - } - - __device__ inline void run(SHMEMReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - integral_nonzero_constant const - word_count(reducer.value_size() / sizeof(size_type)); - - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); - // Iterate this block through the league - iterate_through_league(threadid, value); - - // Reduce with final value at blockDim.y - 1 location. - bool do_final_reduce = (m_league_size == 0); - if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); - if (do_final_reduce) { - // This is the final block with the final result at the final threads' - // location - - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; - - if (threadIdx.y == 0) { - reducer.final(reinterpret_cast(shared)); - } - - if (HIPTraits::WarpSize < word_count.value) { - __syncthreads(); - } - - for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { - global[i] = shared[i]; - } - } - } - - __device__ inline void run(ShflReductionTag, int const threadid) const { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - value_type value; - reducer.init(&value); - - // Iterate this block through the league - iterate_through_league(threadid, value); - - pointer_type const result = - m_result_ptr_device_accessible - ? 
m_result_ptr - : reinterpret_cast(m_scratch_space); - - value_type init; - reducer.init(&init); - if (m_league_size == 0) { - reducer.final(&value); - *result = value; - } else if (Impl::hip_inter_block_shuffle_reduction( - value, init, reducer, m_scratch_space, result, - m_scratch_flags, blockDim.y)) { - unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; - if (id == 0) { - reducer.final(&value); - *result = value; - } - } - } - - inline void execute() { - const ReducerType& reducer = m_functor_reducer.get_reducer(); - - const bool is_empty_range = m_league_size == 0 || m_team_size == 0; - const bool need_device_set = ReducerType::has_init_member_function() || - ReducerType::has_final_member_function() || - !m_result_ptr_host_accessible || - Policy::is_graph_kernel::value || - !std::is_same::value; - if (!is_empty_range || need_device_set) { - int const block_count = compute_block_count(); - - m_scratch_space = hip_internal_scratch_space( - m_policy.space(), reducer.value_size() * block_count); - m_scratch_flags = - hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - - dim3 block(m_vector_size, m_team_size, 1); - dim3 grid(block_count, 1, 1); - if (is_empty_range) { - block = dim3(1, 1, 1); - grid = dim3(1, 1, 1); - } - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - Impl::hip_parallel_launch( - *this, grid, block, shmem_size_total, - m_policy.space().impl_internal_space_instance(), - true); // copy to device and execute - - if (!m_result_ptr_device_accessible) { - m_policy.space().impl_internal_space_instance()->fence(); - - if (m_result_ptr) { - const int size = reducer.value_size(); - DeepCopy(m_result_ptr, m_scratch_space, size); - } - } - } else { - if (m_result_ptr) { - reducer.init(m_result_ptr); - } - } - } - - template - ParallelReduce(CombinedFunctorReducerType const& arg_functor_reducer, - Policy const& arg_policy, ViewType const& arg_result) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_result_ptr_device_accessible( - MemorySpaceAccess::accessible), - m_result_ptr_host_accessible( - MemorySpaceAccess::accessible), - m_scratch_space(nullptr), - m_scratch_flags(nullptr), - m_team_begin(0), - m_shmem_begin(0), - m_shmem_size(0), - m_scratch_ptr{nullptr, nullptr}, - m_league_size(arg_policy.league_size()), - m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { - auto internal_space_instance = - m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - - m_team_begin = - UseShflReduction - ? 
0 - : hip_single_inter_block_reduce_scan_shmem( - arg_functor_reducer.get_functor(), m_team_size); - m_shmem_begin = sizeof(double) * (m_team_size + 2); - m_shmem_size = m_policy.scratch_size(0, m_team_size) + - FunctorTeamShmemSize::value( - arg_functor_reducer.get_functor(), m_team_size); - m_scratch_size[0] = m_shmem_size; - m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_locks = internal_space_instance->m_scratch_locks; - m_num_scratch_locks = internal_space_instance->m_num_scratch_locks; - if (m_team_size <= 0) { - m_scratch_ptr[1] = nullptr; - } else { - m_scratch_pool_id = internal_space_instance->acquire_team_scratch_space(); - m_scratch_ptr[1] = internal_space_instance->resize_team_scratch_space( - m_scratch_pool_id, - static_cast(m_scratch_size[1]) * - (std::min( - static_cast(HIP().concurrency() / - (m_team_size * m_vector_size)), - static_cast(m_league_size)))); - } - - // The global parallel_reduce does not support vector_length other than 1 at - // the moment - if ((arg_policy.impl_vector_length() > 1) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a vector length of " - "greater than 1 is not currently supported for HIP for dynamic " - "sized reduction types."); - - if ((m_team_size < HIPTraits::WarpSize) && !UseShflReduction) - Impl::throw_runtime_exception( - "Kokkos::parallel_reduce with a TeamPolicy using a team_size smaller " - "than 64 is not currently supported with HIP for dynamic sized " - "reduction types."); - - // Functor's reduce memory, team scan memory, and team shared memory depend - // upon team size. - - const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - - if (!Kokkos::Impl::is_integral_power_of_two(m_team_size) && - !UseShflReduction) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); - } - - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " - "L0 scratch memory")); - } - - size_t max_size = arg_policy.team_size_max( - arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " - "large team size.")); - } - } - - ~ParallelReduce() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); - } - } -}; -} // namespace Impl -} // namespace Kokkos - -#endif - -#endif diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index ea599989e7a..ab24004f5fc 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -18,138 +18,14 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE #endif -#include -#include #include - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; - -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - 
alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - HIP exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIP& arg_exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // Copy to device memory - Kokkos::Impl::DeepCopy(arg_exec_space, - RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via host pinned memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord::SharedAllocationRecord( - const HIPManagedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, 
arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information, directly accessible via managed memory - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos +#include +#include +#include + +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPHostPinnedSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::HIPManagedSpace); diff --git a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index e68bad97230..fbae5188344 100644 --- a/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -18,120 +18,11 @@ #define KOKKOS_HIP_SHARED_ALLOCATION_RECORD_HPP #include +#include -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - friend class HostInaccessibleSharedAllocationRecordCommon; - using base_t = HostInaccessibleSharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec*/, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIP& exec_space, const HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const HIPSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class SharedAllocationRecordCommon; - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const HIPHostPinnedSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const HIPHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const HIPHostPinnedSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend class 
SharedAllocationRecordCommon<Kokkos::HIPManagedSpace>;
-  using base_t     = SharedAllocationRecordCommon<Kokkos::HIPManagedSpace>;
-  using RecordBase = SharedAllocationRecord<void, void>;
-
-  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
-  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
-
-#ifdef KOKKOS_ENABLE_DEBUG
-  static RecordBase s_root_record;
-#endif
-
-  const HIPManagedSpace m_space;
-
- protected:
-  ~SharedAllocationRecord();
-  SharedAllocationRecord() = default;
-
-  template <typename ExecutionSpace>
-  SharedAllocationRecord(
-      const ExecutionSpace& /*exec_space*/, const HIPManagedSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &base_t::deallocate)
-      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
-                               arg_dealloc) {}
-
-  SharedAllocationRecord(
-      const HIPManagedSpace& arg_space, const std::string& arg_label,
-      const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &base_t::deallocate);
-};
-}  // namespace Impl
-}  // namespace Kokkos
+KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
+    Kokkos::HIPSpace);
+KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPHostPinnedSpace);
+KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPManagedSpace);
 
 #endif
diff --git a/core/src/HIP/Kokkos_HIP_Space.cpp b/core/src/HIP/Kokkos_HIP_Space.cpp
index 7f6aa0d8e82..e8bdfca66fe 100644
--- a/core/src/HIP/Kokkos_HIP_Space.cpp
+++ b/core/src/HIP/Kokkos_HIP_Space.cpp
@@ -24,10 +24,8 @@
 #include
 #include
-#include
 #include
-#include
 #include
 #include
@@ -287,22 +285,3 @@ void HIPManagedSpace::impl_deallocate(
 }
 
 }  // namespace Kokkos
-
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#include <impl/Kokkos_SharedAlloc_timpl.hpp>
-
-namespace Kokkos {
-namespace Impl {
-
-// To avoid additional compilation cost for something that's (mostly?) not
-// performance sensitive, we explicitly instantiate these CRTP base classes
-// here, where we have access to the associated *_timpl.hpp header files.
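The instantiations this comment introduces follow the classic explicit-instantiation recipe; reduced to a self-contained miniature with purely illustrative names (the patch replaces the hand-written form below with the KOKKOS_IMPL_*_EXPLICIT_INSTANTIATION macros seen earlier in this diff):

struct SpaceStandIn {};  // stand-in for a memory space type

// In the header: declaration only, plus an extern template declaration that
// suppresses implicit instantiation in every other translation unit.
template <class MemorySpace>
class RecordCommonSketch {
 public:
  void deallocate_all();
};
extern template class RecordCommonSketch<SpaceStandIn>;

// In the *_timpl-style header, included by exactly one .cpp: the definition...
template <class MemorySpace>
void RecordCommonSketch<MemorySpace>::deallocate_all() {}

// ...and the explicit instantiation that pays the compile cost exactly once.
template class RecordCommonSketch<SpaceStandIn>;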
-template class HostInaccessibleSharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos diff --git a/core/src/HIP/Kokkos_HIP_Space.hpp b/core/src/HIP/Kokkos_HIP_Space.hpp index f3e5adf87e5..7f2004e5cbc 100644 --- a/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/core/src/HIP/Kokkos_HIP_Space.hpp @@ -65,6 +65,15 @@ class HIPSpace { ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_HIP Use execution space instance + void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -76,8 +85,6 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -94,8 +101,6 @@ class HIPSpace { private: int m_device; ///< Which HIP device - - friend class Kokkos::Impl::SharedAllocationRecord; }; template <> @@ -129,6 +134,16 @@ class HIPHostPinnedSpace { ~HIPHostPinnedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -140,8 +155,6 @@ class HIPHostPinnedSpace { const size_t arg_logical_size = 0) const; private: - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -194,6 +207,16 @@ class HIPManagedSpace { ~HIPManagedSpace() = default; /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -209,8 +232,6 @@ class HIPManagedSpace { private: int m_device; ///< Which HIP device - template - friend class LogicalMemorySpace; void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -239,8 +260,7 @@ struct Impl::is_hip_type_space : public std::true_type {}; namespace Kokkos { namespace Impl { -static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - 
""); +static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); //---------------------------------------- diff --git a/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp new file mode 100644 index 00000000000..67e1181125c --- /dev/null +++ b/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -0,0 +1,421 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP +#define KOKKOS_HIP_TEAM_POLICY_INTERNAL_HPP + +#include + +namespace Kokkos { +namespace Impl { + +template +class TeamPolicyInternal + : public PolicyTraits { + public: + using execution_policy = TeamPolicyInternal; + + using traits = PolicyTraits; + + template + friend class TeamPolicyInternal; + + private: + typename traits::execution_space m_space; + int m_league_size; + int m_team_size; + int m_vector_length; + size_t m_team_scratch_size[2]; + size_t m_thread_scratch_size[2]; + int m_chunk_size; + bool m_tune_team_size; + bool m_tune_vector_length; + + public: + using execution_space = HIP; + + template + TeamPolicyInternal(TeamPolicyInternal const& p) { + m_league_size = p.m_league_size; + m_team_size = p.m_team_size; + m_vector_length = p.m_vector_length; + m_team_scratch_size[0] = p.m_team_scratch_size[0]; + m_team_scratch_size[1] = p.m_team_scratch_size[1]; + m_thread_scratch_size[0] = p.m_thread_scratch_size[0]; + m_thread_scratch_size[1] = p.m_thread_scratch_size[1]; + m_chunk_size = p.m_chunk_size; + m_space = p.m_space; + m_tune_team_size = p.m_tune_team_size; + m_tune_vector_length = p.m_tune_vector_length; + } + + template + int team_size_max(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common(f); + } + + template + inline int team_size_max(const FunctorType& f, + const ParallelReduceTag&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Max, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + inline int team_size_max(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + template + int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { + using closure_type = + Impl::ParallelFor>; + + return internal_team_size_common( + f); + } + + template + inline int team_size_recommended(FunctorType const& f, + ParallelReduceTag const&) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + using closure_type = Impl::ParallelReduce< + CombinedFunctorReducer, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common< + BlockType::Preferred, closure_type, + typename functor_analysis_type::value_type>(f); + } + + template + int team_size_recommended(FunctorType const& f, ReducerType const&, + ParallelReduceTag 
const&) const { + using closure_type = + Impl::ParallelReduce, + TeamPolicy, Kokkos::HIP>; + return internal_team_size_common(f); + } + + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } + inline bool impl_auto_team_size() const { return m_tune_team_size; } + static int vector_length_max() { return HIPTraits::WarpSize; } + + static int verify_requested_vector_length(int requested_vector_length) { + int test_vector_length = + std::min(requested_vector_length, vector_length_max()); + + // Allow only power-of-two vector_length + if (!(is_integral_power_of_two(test_vector_length))) { + int test_pow2 = 1; + constexpr int warp_size = HIPTraits::WarpSize; + while (test_pow2 < warp_size) { + test_pow2 <<= 1; + if (test_pow2 > test_vector_length) { + break; + } + } + test_vector_length = test_pow2 >> 1; + } + + return test_vector_length; + } + + inline static int scratch_size_max(int level) { + // HIP Teams use (team_size + 2)*sizeof(double) shared memory for team + // reductions. They also use one int64_t in static shared memory for a + // shared ID. Furthermore, they use additional scratch memory in some + // reduction scenarios, which depend on the size of the value_type and is + // NOT captured here + constexpr size_t max_possible_team_size = 1024; + constexpr size_t max_reserved_shared_mem_per_team = + (max_possible_team_size + 2) * sizeof(double) + sizeof(int64_t); + // arbitrarily setting level 1 scratch limit to 20MB, for a + // MI250 that would give us about 4.4GB for 2 teams per CU + constexpr size_t max_l1_scratch_size = 20 * 1024 * 1024; + + size_t max_shmem = HIP().hip_device_prop().sharedMemPerBlock; + return (level == 0 ? max_shmem - max_reserved_shared_mem_per_team + : max_l1_scratch_size); + } + + inline void impl_set_vector_length(size_t size) { m_vector_length = size; } + inline void impl_set_team_size(size_t size) { m_team_size = size; } + int impl_vector_length() const { return m_vector_length; } + + int team_size() const { return m_team_size; } + + int league_size() const { return m_league_size; } + + size_t scratch_size(int level, int team_size_ = -1) const { + if (team_size_ < 0) team_size_ = m_team_size; + return m_team_scratch_size[level] + + team_size_ * m_thread_scratch_size[level]; + } + + size_t team_scratch_size(int level) const { + return m_team_scratch_size[level]; + } + + size_t thread_scratch_size(int level) const { + return m_thread_scratch_size[level]; + } + + typename traits::execution_space space() const { return m_space; } + + TeamPolicyInternal() + : m_space(typename traits::execution_space()), + m_league_size(0), + m_team_size(-1), + m_vector_length(0), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(false), + m_tune_vector_length(false) {} + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, int vector_length_request = 1) + : m_space(space_), + m_league_size(league_size_), + m_team_size(team_size_request), + m_vector_length( + (vector_length_request > 0) + ? 
verify_requested_vector_length(vector_length_request) + : (verify_requested_vector_length(1))), + m_team_scratch_size{0, 0}, + m_thread_scratch_size{0, 0}, + m_chunk_size(HIPTraits::WarpSize), + m_tune_team_size(bool(team_size_request <= 0)), + m_tune_vector_length(bool(vector_length_request <= 0)) { + // Make sure league size is permissible + if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + Impl::throw_runtime_exception( + "Requested too large league_size for TeamPolicy on HIP execution " + "space."); + + // Make sure total block size is permissible + if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { + Impl::throw_runtime_exception( + std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " + "Team size x vector length must be smaller than 1024.")); + } + } + + /** \brief Specify league size, request team size */ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(space_, league_size_, -1, vector_length_request) {} + // FLAG + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + ) + : TeamPolicyInternal(space_, league_size_, team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(const execution_space space_, int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(space_, league_size_, -1, -1) + + {} + + TeamPolicyInternal(int league_size_, int team_size_request, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, vector_length_request) {} + + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + int vector_length_request = 1) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + vector_length_request) {} + + /** \brief Specify league size and team size, request vector length*/ + TeamPolicyInternal(int league_size_, int team_size_request, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, + team_size_request, -1) + + {} + + /** \brief Specify league size, request team size and vector length*/ + TeamPolicyInternal(int league_size_, + const Kokkos::AUTO_t& /* team_size_request */, + const Kokkos::AUTO_t& /* vector_length_request */ + + ) + : TeamPolicyInternal(typename traits::execution_space(), league_size_, -1, + -1) {} + + int chunk_size() const { return m_chunk_size; } + + TeamPolicyInternal& set_chunk_size(typename traits::index_type chunk_size_) { + m_chunk_size = chunk_size_; + return *this; + } + + /** \brief set per team scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerTeamValue const& per_team) { + m_team_scratch_size[level] = per_team.value; + return *this; + } + + /** \brief set per thread scratch size for a specific level of the scratch + * hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, + PerThreadValue const& per_thread) { + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + /** \brief set per thread and per team scratch size for a specific level of + * the scratch 
hierarchy */ + TeamPolicyInternal& set_scratch_size(int level, PerTeamValue const& per_team, + PerThreadValue const& per_thread) { + m_team_scratch_size[level] = per_team.value; + m_thread_scratch_size[level] = per_thread.value; + return *this; + } + + using member_type = Kokkos::Impl::HIPTeamMember; + + protected: + template + int internal_team_size_common(FunctorType const& f) const { + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); + using Tag = typename PatternTagFromImplSpecialization::type; + if constexpr (std::is_same_v) { + using Interface = + typename Impl::DeduceFunctorPatternInterface::type; + using Analysis = + Impl::FunctorAnalysis; + shmem_thread += + ((Analysis::StaticValueSize != 0) ? 0 : Analysis::value_size(f)); + } + const int vector_length = impl_vector_length(); + + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + if constexpr (BlockSize == BlockType::Max) { + block_size = hip_get_max_team_blocksize( + space().impl_internal_space_instance(), functor); + } else { + block_size = + hip_get_preferred_team_blocksize( + space().impl_internal_space_instance(), functor); + } + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception(std::string( + "Kokkos::Impl::ParallelFor/Reduce< HIP > could not find a valid " + "team size.")); + } + if constexpr (std::is_same_v) { + return block_size / impl_vector_length(); + } else { + // Currently we require Power-of-2 team size for reductions. 
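// For example, with block_size = 96 the loop below grows p2 through
// 2, 4, 8, ..., 128 and the final halving yields 64, i.e. the largest
// power of two not exceeding block_size; dividing by the vector length
// then gives the power-of-two team size handed back for reductions.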
+ int p2 = 1; + while (p2 <= block_size) p2 *= 2; + p2 /= 2; + return p2 / impl_vector_length(); + } + } +}; + +__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size, + int32_t* scratch_locks, + size_t num_scratch_locks) { + int64_t threadid = 0; + __shared__ int64_t base_thread_id; + if (threadIdx.x == 0 && threadIdx.y == 0) { + int64_t const wraparound_len = + Kokkos::min(int64_t(league_size), + int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)); + threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len; + threadid *= blockDim.x * blockDim.y; + int done = 0; + while (!done) { + done = (0 == atomicCAS(&scratch_locks[threadid], 0, 1)); + if (!done) { + threadid += blockDim.x * blockDim.y; + if (int64_t(threadid + blockDim.x * blockDim.y) >= + wraparound_len * blockDim.x * blockDim.y) + threadid = 0; + } + } + base_thread_id = threadid; + } + __syncthreads(); + threadid = base_thread_id; + return threadid; +} + +__device__ inline void hip_release_scratch_index(int32_t* scratch_locks, + int64_t threadid) { + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + scratch_locks[threadid] = 0; + } +} + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/core/src/HIP/Kokkos_HIP_UniqueToken.hpp index 313e5f52172..3d70b596463 100644 --- a/core/src/HIP/Kokkos_HIP_UniqueToken.hpp +++ b/core/src/HIP/Kokkos_HIP_UniqueToken.hpp @@ -19,7 +19,6 @@ #include #include -#include namespace Kokkos { diff --git a/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 5c40d0fbc8d..4bca29868f7 100644 --- a/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -25,19 +25,11 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const HIP& exec_space, const View& dst) { KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( dst.data(), 0, dst.size() * sizeof(typename View::value_type), exec_space.hip_stream())); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - KOKKOS_IMPL_HIP_SAFE_CALL( - hipMemset(dst.data(), 0, - dst.size() * sizeof(typename View::value_type))); - } }; } // namespace Impl diff --git a/core/src/HPX/Kokkos_HPX.cpp b/core/src/HPX/Kokkos_HPX.cpp index 4a40ffcaa4f..6d541a64148 100644 --- a/core/src/HPX/Kokkos_HPX.cpp +++ b/core/src/HPX/Kokkos_HPX.cpp @@ -103,6 +103,7 @@ void HPX::print_configuration(std::ostream &os, const bool) const { os << hpx::configuration_string() << '\n'; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 bool &HPX::impl_get_in_parallel() noexcept { static thread_local bool in_parallel = false; return in_parallel; @@ -127,6 +128,7 @@ HPX::impl_not_in_parallel_scope::~impl_not_in_parallel_scope() noexcept { KOKKOS_EXPECTS(!impl_get_in_parallel()); impl_get_in_parallel() = true; } +#endif void HPX::impl_decrement_active_parallel_region_count() { std::unique_lock l(m_active_parallel_region_count_mutex); diff --git a/core/src/HPX/Kokkos_HPX.hpp b/core/src/HPX/Kokkos_HPX.hpp index 1dfc5b40646..26181a7c05d 100644 --- a/core/src/HPX/Kokkos_HPX.hpp +++ b/core/src/HPX/Kokkos_HPX.hpp @@ -27,14 +27,6 @@ static_assert(false, #include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -59,6 +51,7 @@ static_assert(false, #include +#include #include #include #include @@ -201,6 +194,7 @@ class HPX { return impl_get_instance_data().m_instance_id; } +#ifdef 
KOKKOS_ENABLE_DEPRECATED_CODE_4 static bool &impl_get_in_parallel() noexcept; struct impl_in_parallel_scope { @@ -223,9 +217,10 @@ class HPX { delete; }; - static bool in_parallel(HPX const & = HPX()) noexcept { + KOKKOS_DEPRECATED static bool in_parallel(HPX const & = HPX()) noexcept { return impl_get_in_parallel(); } +#endif static void impl_decrement_active_parallel_region_count(); static void impl_increment_active_parallel_region_count(); @@ -248,18 +243,6 @@ class HPX { #endif } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - template - KOKKOS_DEPRECATED static void partition_master( - F const &, int requested_num_partitions = 0, int = 0) { - if (requested_num_partitions > 1) { - Kokkos::abort( - "Kokkos::Experimental::HPX::partition_master: can't partition an " - "HPX instance\n"); - } - } -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); #else @@ -355,7 +338,9 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_plain_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, n, stacksize); @@ -417,15 +402,21 @@ class HPX { hpx::threads::thread_stacksize::default_) const { impl_bulk_setup_finalize_erased(force_synchronous, is_light_weight_policy, {[functor](Index i) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.execute_range(i); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.setup(); }}, {[functor]() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 impl_in_parallel_scope p; +#endif functor.finalize(); }}, n, stacksize); @@ -1292,6 +1283,7 @@ class ParallelScan, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1299,6 +1291,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1320,6 +1315,7 @@ class ParallelScan, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1327,6 +1323,9 @@ class ParallelScan, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( @@ -1407,6 +1406,7 @@ class ParallelScanWithTotal, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the current thread before suspending and set @@ -1414,6 +1414,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1435,6 +1438,7 @@ class ParallelScanWithTotal, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 { // Since arrive_and_wait may yield and resume on another worker thread we // set in_parallel = false on the 
current thread before suspending and set @@ -1442,6 +1446,9 @@ class ParallelScanWithTotal, Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; barrier.arrive_and_wait(); } +#else + barrier.arrive_and_wait(); +#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( diff --git a/core/src/KokkosExp_MDRangePolicy.hpp b/core/src/KokkosExp_MDRangePolicy.hpp index c9080db01ca..297b1fadee9 100644 --- a/core/src/KokkosExp_MDRangePolicy.hpp +++ b/core/src/KokkosExp_MDRangePolicy.hpp @@ -73,7 +73,7 @@ is_less_than_value_initialized_variable(T arg) { // Checked narrowing conversion that calls abort if the cast changes the value template -constexpr To checked_narrow_cast(From arg) { +constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = (std::is_signed::value != std::is_signed::value); auto const ret = static_cast(arg); @@ -81,7 +81,12 @@ constexpr To checked_narrow_cast(From arg) { (is_different_signedness && is_less_than_value_initialized_variable(arg) != is_less_than_value_initialized_variable(ret))) { - Kokkos::abort("unsafe narrowing conversion"); + auto msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(arg) + ") in dimension (" + std::to_string(idx) + + "), which may not preserve its original value.\n"; + Kokkos::abort(msg.c_str()); } return ret; } @@ -96,15 +101,15 @@ constexpr Array to_array_potentially_narrowing(const U (&init)[M]) { using T = typename Array::value_type; Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); auto* ptr = a.data(); // NOTE equivalent to // std::transform(std::begin(init), std::end(init), a.data(), // [](U x) { return static_cast(x); }); // except that std::transform is not constexpr. - for (auto x : init) { - *ptr++ = checked_narrow_cast(x); - (void)checked_narrow_cast(x); // see note above + for (std::size_t i = 0; i < M; ++i) { + *ptr++ = checked_narrow_cast(init[i], i); + (void)checked_narrow_cast(init[i], i); // see note above } return a; } @@ -120,10 +125,10 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( using T = typename NVCC_WONT_LET_ME_CALL_YOU_Array::value_type; NVCC_WONT_LET_ME_CALL_YOU_Array a{}; constexpr std::size_t N = a.size(); - static_assert(M <= N, ""); + static_assert(M <= N); for (std::size_t i = 0; i < M; ++i) { - a[i] = checked_narrow_cast(other[i]); - (void)checked_narrow_cast(other[i]); // see note above + a[i] = checked_narrow_cast(other[i], i); + (void)checked_narrow_cast(other[i], i); // see note above } return a; } @@ -150,9 +155,20 @@ TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { // multi-dimensional iteration pattern template -struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { - using traits = Kokkos::Impl::PolicyTraits; - using range_policy = RangePolicy; +struct MDRangePolicy; + +// Note: If MDRangePolicy has a primary template, implicit CTAD (deduction +// guides) are generated -> MDRangePolicy<> by some compilers, which is +// incorrect. By making it a template specialization instead, no implicit CTAD +// is generated. This works because there has to be at least one property +// specified (which is Rank<...>); otherwise, we'd get the static_assert +// "Kokkos::Error: MD iteration pattern not defined". This template +// specialization uses in all places for correctness. 
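A minimal, self-contained sketch of the CTAD behavior this note describes (the types Defined and SpecializedOnly are hypothetical stand-ins, not part of Kokkos): a primary template that is defined gets implicit deduction guides from its constructors, so class template argument deduction can silently pick an empty parameter pack, whereas a primary template that is only declared, with the definition in a partial specialization, generates no implicit guides.

#include <type_traits>

// Defined primary template: implicit CTAD guides are generated from its
// constructors, so "Defined d(42);" deduces the empty pack Defined<>.
template <class... Ts>
struct Defined {
  Defined(int) {}
};

// Declared-only primary template; the definition requires at least one
// template argument. No implicit guides exist, so CTAD from a plain int
// cannot succeed unless an explicit deduction guide is provided.
template <class... Ts>
struct SpecializedOnly;

template <class T0, class... Ts>
struct SpecializedOnly<T0, Ts...> {
  SpecializedOnly(int) {}
};

int main() {
  Defined d(42);
  static_assert(std::is_same_v<decltype(d), Defined<>>);
  // SpecializedOnly s(42);  // error: no viable deduction
}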
+template +struct MDRangePolicy + : public Kokkos::Impl::PolicyTraits { + using traits = Kokkos::Impl::PolicyTraits; + using range_policy = RangePolicy; typename traits::execution_space m_space; @@ -161,8 +177,8 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { typename traits::schedule_type, typename traits::index_type>; using execution_policy = - MDRangePolicy; // needed for is_execution_space - // interrogation + MDRangePolicy; // needed for is_execution_policy + // interrogation template friend struct MDRangePolicy; @@ -327,6 +343,20 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; + + if (m_upper[i] < m_lower[i]) { + std::string msg = + "Kokkos::MDRangePolicy bounds error: The lower bound (" + + std::to_string(m_lower[i]) + ") is greater than its upper bound (" + + std::to_string(m_upper[i]) + ") in dimension " + std::to_string(i) + + ".\n"; +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + Kokkos::abort(msg.c_str()); +#elif defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + Kokkos::Impl::log_warning(msg); +#endif + } + if (m_tile[i] <= 0) { m_tune_tile_size = true; if ((inner_direction == Iterate::Right && (i < rank - 1)) || @@ -358,6 +388,60 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { } }; +template +MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; + +template +MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], + const TT (&)[TN]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) + ->MDRangePolicy>; + +template +MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; + +template +MDRangePolicy(Array const&, Array const&, Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&) + ->MDRangePolicy>; + +template +MDRangePolicy(DefaultExecutionSpace const&, Array const&, + Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&) + ->MDRangePolicy>; + +template >> +MDRangePolicy(ES const&, Array const&, Array const&, + Array const&) + ->MDRangePolicy>; + } // namespace Kokkos #endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/core/src/Kokkos_Array.hpp b/core/src/Kokkos_Array.hpp index 82ceaaec218..ba1626bb72e 100644 --- a/core/src/Kokkos_Array.hpp +++ b/core/src/Kokkos_Array.hpp @@ -22,6 +22,7 @@ #endif #include +#include #include #include @@ -320,6 +321,9 @@ struct Array::strided> { : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} }; +template +Array(T, Us...)->Array; + } // namespace Kokkos // diff --git a/core/src/Kokkos_Assert.hpp b/core/src/Kokkos_Assert.hpp index c3b9004734a..6fea286005e 100644 --- a/core/src/Kokkos_Assert.hpp +++ b/core/src/Kokkos_Assert.hpp @@ -44,9 +44,6 @@ __LINE__) " \n"); \ } \ } -// some projects already define this for themselves, so don't mess -// them up -#ifndef KOKKOS_ASSERT #define KOKKOS_ASSERT(...) 
\ { \ if (!bool(__VA_ARGS__)) { \ @@ -58,8 +55,7 @@ __LINE__) " \n"); \ } \ } -#endif // ifndef KOKKOS_ASSERT -#else // not debug mode +#else // not debug mode #define KOKKOS_EXPECTS(...) #define KOKKOS_ENSURES(...) #ifndef KOKKOS_ASSERT diff --git a/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp index 1c434746321..9acacef901a 100644 --- a/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp @@ -25,7 +25,7 @@ static_assert(false, #include #include -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index bda37839805..eebdd20f15d 100644 --- a/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -49,7 +49,7 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #endif // ============================================================ -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() #else #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() diff --git a/core/src/Kokkos_Clamp.hpp b/core/src/Kokkos_Clamp.hpp new file mode 100644 index 00000000000..033cde9ab84 --- /dev/null +++ b/core/src/Kokkos_Clamp.hpp @@ -0,0 +1,41 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_CLAMP_HPP +#define KOKKOS_CLAMP_HPP + +#include + +namespace Kokkos { + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi) { + KOKKOS_EXPECTS(!(hi < lo)); + return (value < lo) ? lo : (hi < value) ? hi : value; +} + +template +constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, + const T& hi, + ComparatorType comp) { + KOKKOS_EXPECTS(!comp(hi, lo)); + return comp(value, lo) ? lo : comp(hi, value) ? 
hi : value; +} + +} // namespace Kokkos + +#endif diff --git a/core/src/Kokkos_CopyViews.hpp b/core/src/Kokkos_CopyViews.hpp index a0ca55be704..08f6ba8d696 100644 --- a/core/src/Kokkos_CopyViews.hpp +++ b/core/src/Kokkos_CopyViews.hpp @@ -22,6 +22,7 @@ static_assert(false, #ifndef KOKKOS_COPYVIEWS_HPP_ #define KOKKOS_COPYVIEWS_HPP_ #include +#include #include #include #include @@ -612,12 +613,17 @@ void view_copy(const DstType& dst, const SrcType& src) { }; if (!DstExecCanAccessSrc && !SrcExecCanAccessDst) { - std::string message( - "Error: Kokkos::deep_copy with no available copy mechanism: "); - message += src.label(); - message += " to "; - message += dst.label(); - Kokkos::Impl::throw_runtime_exception(message); + std::ostringstream ss; + ss << "Error: Kokkos::deep_copy with no available copy mechanism: " + << "from source view (\"" << src.label() << "\") to destination view (\"" + << dst.label() << "\").\n" + << "There is no common execution space that can access both source's " + "space\n" + << "(" << src_memory_space().name() << ") and destination's space (" + << dst_memory_space().name() << "), " + << "so source and destination\n" + << "must be contiguous and have the same layout.\n"; + Kokkos::Impl::throw_runtime_exception(ss.str()); } // Figure out iteration order in case we need it @@ -1330,13 +1336,12 @@ inline void contiguous_fill( // Default implementation for execution spaces that don't provide a definition template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst, - typename ViewType::const_value_type& value) { - contiguous_fill(exec_space, dst, value); - } - - ZeroMemset(const ViewType& dst, typename ViewType::const_value_type& value) { - contiguous_fill(ExecutionSpace(), dst, value); + ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { + using ValueType = typename ViewType::value_type; + alignas(alignof(ValueType)) unsigned char + zero_initialized_storage[sizeof(ValueType)] = {}; + contiguous_fill(exec_space, dst, + *reinterpret_cast(zero_initialized_storage)); } }; @@ -1348,13 +1353,18 @@ inline std::enable_if_t< contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) - ZeroMemset>(exec_space, dst, value); - else + // With OpenMP, using memset has significant performance issues. 
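For reference, the Impl::is_zero_byte test that gates this branch can be pictured as a byte-wise comparison of the object representation; the following stand-in is a minimal sketch under that assumption, not the actual Kokkos implementation:

#include <cstring>

// True when every byte of value's object representation is zero -- the
// precondition for replacing an element-wise fill with a zero memset.
template <class T>
bool is_zero_byte_sketch(const T& value) {
  const unsigned char zeros[sizeof(T)] = {};
  return std::memcmp(&value, zeros, sizeof(T)) == 0;
}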
+ if (Impl::is_zero_byte(value) +#ifdef KOKKOS_ENABLE_OPENMP + && !std::is_same_v #endif + ) + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset>(exec_space, dst); + else contiguous_fill(exec_space, dst, value); } @@ -1379,15 +1389,20 @@ contiguous_fill_or_memset( typename ViewTraits::const_value_type& value) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; + exec_space_type exec; // On A64FX memset seems to do the wrong thing with regards to first touch // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - ZeroMemset>(dst, value); + // FIXME intel/19 icpc fails to deduce template parameters here, + // resulting in compilation errors; explicitly passing the template + // parameters to ZeroMemset helps workaround the issue + // See https://github.com/kokkos/kokkos/issues/6775 + ZeroMemset(exec, dst); else #endif - contiguous_fill(exec_space_type(), dst, value); + contiguous_fill(exec, dst, value); } template diff --git a/core/src/Kokkos_Core.hpp b/core/src/Kokkos_Core.hpp index 805411a699e..1f146563be2 100644 --- a/core/src/Kokkos_Core.hpp +++ b/core/src/Kokkos_Core.hpp @@ -46,14 +46,15 @@ #include #include -#include #include -#include +#include +#include #include #include #include #include #include +#include #include #include #include @@ -101,6 +102,7 @@ void declare_configuration_metadata(const std::string& category, [[nodiscard]] bool is_finalized() noexcept; [[nodiscard]] int device_id() noexcept; +[[nodiscard]] int num_devices() noexcept; [[nodiscard]] int num_threads() noexcept; bool show_warnings() noexcept; @@ -300,9 +302,6 @@ std::vector partition_space(ExecSpace const& space, // implementation of the RAII wrapper is using Kokkos::single. #include -// Specializations required after core definitions -#include - //---------------------------------------------------------------------------- // Redefinition of the macros min and max if we pushed them at entry of // Kokkos_Core.hpp diff --git a/core/src/Kokkos_Core_fwd.hpp b/core/src/Kokkos_Core_fwd.hpp index 44f1c5b42f4..7edb35f00eb 100644 --- a/core/src/Kokkos_Core_fwd.hpp +++ b/core/src/Kokkos_Core_fwd.hpp @@ -30,10 +30,6 @@ #include #include -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -#include -#endif - //---------------------------------------------------------------------------- // Have assumed a 64-bit build (8-byte pointers) throughout the code base. // 32-bit build allowed but unsupported. @@ -75,9 +71,6 @@ template struct Device; // forward declare here so that backend initializer calls can use it. 
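As a usage note for the num_devices() declaration added to Kokkos_Core.hpp above: paired with the existing device_id(), it can be queried after initialization. This is a sketch only, assuming the natural reading of the new query:

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // device_id() reports the device this process uses; num_devices() is
    // the companion query declared next to it in this patch.
    std::printf("device %d of %d\n", Kokkos::device_id(),
                Kokkos::num_devices());
  }
  Kokkos::finalize();
  return 0;
}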
-#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments; -#endif class InitializationSettings; } // namespace Kokkos @@ -262,12 +255,6 @@ KOKKOS_FUNCTION void runtime_check_memory_access_violation( } } // namespace Impl - -namespace Experimental { -template -class LogicalMemorySpace; -} - } // namespace Kokkos //---------------------------------------------------------------------------- diff --git a/core/src/Kokkos_ExecPolicy.hpp b/core/src/Kokkos_ExecPolicy.hpp index ae1585a4989..5f251eeb26a 100644 --- a/core/src/Kokkos_ExecPolicy.hpp +++ b/core/src/Kokkos_ExecPolicy.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include #include +#include //---------------------------------------------------------------------------- @@ -114,62 +115,67 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask(0) {} /** \brief Total range */ + template && + std::is_convertible_v), + bool> = false> + inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end) + : RangePolicy(typename traits::execution_space(), work_begin, work_end) {} + + /** \brief Total range */ + template && + std::is_convertible_v), + bool> = false> inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end) + const IndexType1 work_begin, const IndexType2 work_end) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); set_auto_chunk_size(); } - /** \brief Total range */ - inline RangePolicy(const member_type work_begin, const member_type work_end) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - } - - /** \brief Total range */ - template - inline RangePolicy(const typename traits::execution_space& work_space, - const member_type work_begin, const member_type work_end, - Args... args) + template && + std::is_convertible_v), + bool> = false> + RangePolicy(const typename traits::execution_space& work_space, + const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) : m_space(work_space), - m_begin(work_begin < work_end ? work_begin : 0), - m_end(work_begin < work_end ? work_end : 0), + m_begin(work_begin), + m_end(work_end), m_granularity(0), m_granularity_mask(0) { - set_auto_chunk_size(); - set(args...); + check_conversion_safety(work_begin); + check_conversion_safety(work_end); + check_bounds_validity(); + set_chunk_size(chunk_size.value); } /** \brief Total range */ - template - inline RangePolicy(const member_type work_begin, const member_type work_end, - Args... args) - : RangePolicy(typename traits::execution_space(), work_begin, work_end) { - set_auto_chunk_size(); - set(args...); - } - - private: - inline void set() {} + template && + std::is_convertible_v), + bool> = false> + RangePolicy(const IndexType1 work_begin, const IndexType2 work_end, + const ChunkSize chunk_size) + : RangePolicy(typename traits::execution_space(), work_begin, work_end, + chunk_size) {} public: - template - inline void set(Args...) { - static_assert( - 0 == sizeof...(Args), - "Kokkos::RangePolicy: unhandled constructor arguments encountered."); - } - - template - inline void set(const ChunkSize& chunksize, Args... 
args) { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead") + inline void set(ChunkSize chunksize) { m_granularity = chunksize.value; m_granularity_mask = m_granularity - 1; - set(args...); } +#endif public: /** \brief return chunk_size */ @@ -218,6 +224,67 @@ class RangePolicy : public Impl::PolicyTraits { m_granularity_mask = m_granularity - 1; } + void check_bounds_validity() { + if (m_end < m_begin) { + std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" + + std::to_string(m_begin) + + ") is greater than the upper bound (" + + std::to_string(m_end) + ").\n"; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + m_begin = 0; + m_end = 0; +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } + } + + // To be replaced with std::in_range (c++20) + template + static void check_conversion_safety(const IndexType bound) { +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ + defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; + + if constexpr (std::is_signed_v != + std::is_signed_v) { + // check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); + + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } + + // check narrowing + warn |= (static_cast(static_cast(bound)) != bound); + + if (warn) { +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + Kokkos::abort(msg.c_str()); +#endif + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + Kokkos::Impl::log_warning(msg); +#endif + } +#else + (void)bound; +#endif + } + public: /** \brief Subrange for a partition's rank and size. 
* @@ -261,6 +328,21 @@ class RangePolicy : public Impl::PolicyTraits { }; }; +RangePolicy()->RangePolicy<>; + +RangePolicy(int64_t, int64_t)->RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; + +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) + ->RangePolicy<>; + +template >> +RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; + +template >> +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; + } // namespace Kokkos //---------------------------------------------------------------------------- @@ -983,7 +1065,16 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + policy.team.team_reduce( + Kokkos::Sum{val}); } template @@ -997,7 +1088,29 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); } template @@ -1011,7 +1124,31 @@ template const& policy, Lambda const& lambda, ReducerValueType& val) { + static_assert(/*!Kokkos::is_view_v &&*/ + !std::is_array_v && + !std::is_pointer_v && + !Kokkos::is_reducer_v, + "Only a scalar return types are allowed!"); + + val = ReducerValueType{}; Impl::md_parallel_impl(policy, lambda, val); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || std::is_same_v +#elif defined(KOKKOS_ENABLE_HIP) + || std::is_same_v +#elif defined(KOKKOS_ENABLE_SYCL) + || std::is_same_v +#endif + ) + policy.team.vector_reduce( + Kokkos::Sum{ + val}); + policy.team.team_reduce( + Kokkos::Sum{val}); } template diff --git a/core/src/Kokkos_HBWSpace.hpp b/core/src/Kokkos_HBWSpace.hpp deleted file mode 100644 index 369b7bafb7b..00000000000 --- a/core/src/Kokkos_HBWSpace.hpp +++ /dev/null @@ -1,308 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_HBWSPACE_HPP -#define KOKKOS_HBWSPACE_HPP - -#include -#ifdef KOKKOS_ENABLE_HBWSPACE - -#include - -namespace Kokkos { - -namespace Experimental { - -/// \class HBWSpace -/// \brief Memory management for host memory. -/// -/// HBWSpace is a memory space that governs host memory. "Host" -/// memory means the usual CPU-accessible memory. -class HBWSpace { - public: - //! 
Tag this class as a kokkos memory space - using memory_space = HBWSpace; - using size_type = size_t; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - using execution_space = Kokkos::DefaultHostExecutionSpace; - - //! This memory space preferred device_type - using device_type = Kokkos::Device; - - /**\brief Default memory space instance */ - HBWSpace(); - HBWSpace(const HBWSpace& rhs) = default; - HBWSpace& operator=(const HBWSpace&) = default; - ~HBWSpace() = default; - - /**\brief Non-default memory space instance to choose allocation mechansim, - * if available */ - - enum AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; - - explicit HBWSpace(const AllocationMechanism&); - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const; - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - private: - template - friend class LogicalMemorySpace; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - - public: - /**\brief Return Name of the MemorySpace */ - static constexpr const char* name() { return "HBW"; } - - private: - AllocationMechanism m_alloc_mech; - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::HBWSpace, void>; -}; - -} // namespace Experimental - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public SharedAllocationRecord { - private: - friend Kokkos::Experimental::HBWSpace; - - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase*); - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HBWSpace instance */ - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::HBWSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - 
KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::Experimental::HBWSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - - static void print_records(std::ostream&, - const Kokkos::Experimental::HBWSpace&, - bool detail = false); -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -static_assert( - Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = true }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = false }; - enum : bool { accessible = true }; - enum : bool { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(copy_space, dst, src, n); - } -}; - -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); - } -}; - -} // namespace Impl - -} // namespace Kokkos - -#endif -#endif // #define KOKKOS_HBWSPACE_HPP diff --git a/core/src/Kokkos_HostSpace.hpp b/core/src/Kokkos_HostSpace.hpp index 
252aabd949f..a1fb0f5a677 100644 --- a/core/src/Kokkos_HostSpace.hpp +++ b/core/src/Kokkos_HostSpace.hpp @@ -37,7 +37,6 @@ static_assert(false, #include #include "impl/Kokkos_HostSpace_deepcopy.hpp" -#include /*--------------------------------------------------------------------------*/ @@ -94,6 +93,16 @@ class HostSpace { #endif /**\brief Allocate untracked memory in the space */ + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -105,9 +114,6 @@ class HostSpace { const size_t arg_logical_size = 0) const; private: - template - friend class Kokkos::Experimental::LogicalMemorySpace; - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = @@ -124,7 +130,6 @@ class HostSpace { private: static constexpr const char* m_name = "Host"; - friend class Kokkos::Impl::SharedAllocationRecord; }; } // namespace Kokkos @@ -136,8 +141,7 @@ namespace Kokkos { namespace Impl { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); template struct HostMirror { @@ -173,75 +177,7 @@ struct HostMirror { //---------------------------------------------------------------------------- -namespace Kokkos { - -namespace Impl { - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon { - private: - friend Kokkos::HostSpace; - friend class SharedAllocationRecordCommon; - - using base_t = SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this HostSpace instance */ - static RecordBase s_root_record; -#endif - - Kokkos::HostSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - // This constructor does not forward to the one without exec_space arg - // in order to work around https://github.com/kokkos/kokkos/issues/5258 - // This constructor is templated so I can't just put it into the cpp file - // like the other constructor. 
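Earlier in this HostSpace hunk, allocate gains overloads that take an execution space and simply forward to the untracked allocate; a minimal usage sketch follows (the label and size are illustrative, and the memory is untracked, so it must be released with deallocate):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::HostSpace space;
    Kokkos::DefaultHostExecutionSpace exec;
    // New overload: execution space first, then the usual label/size
    // arguments; it forwards to the plain allocate shown in the diff.
    void* p = space.allocate(exec, "demo_buffer", 1024);
    space.deallocate("demo_buffer", p, 1024);
  }
  Kokkos::finalize();
  return 0;
}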
- template - SharedAllocationRecord( - const ExecutionSpace& /* exec_space*/, const Kokkos::HostSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); - } - - SharedAllocationRecord( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::HostSpace& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } -}; - -} // namespace Impl - -} // namespace Kokkos +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HostSpace); //---------------------------------------------------------------------------- diff --git a/core/src/Kokkos_LogicalSpaces.hpp b/core/src/Kokkos_LogicalSpaces.hpp deleted file mode 100644 index 1ee1d2c81fe..00000000000 --- a/core/src/Kokkos_LogicalSpaces.hpp +++ /dev/null @@ -1,413 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_LOGICALSPACES_HPP -#define KOKKOS_LOGICALSPACES_HPP - -#include -#include -#include -#include -#include -#include -#include -#include -namespace Kokkos { -namespace Experimental { -struct DefaultMemorySpaceNamer { - static constexpr const char* get_name() { - return "DefaultLogicalMemorySpaceName"; - } -}; - -struct LogicalSpaceSharesAccess { - struct shared_access {}; - struct no_shared_access {}; -}; - -/// \class LogicalMemorySpace -/// \brief -/// -/// LogicalMemorySpace is a space that is identical to another space, -/// but differentiable by name and template argument -template -class LogicalMemorySpace { -#ifdef KOKKOS_ENABLE_OPENMPTARGET - // [DZP] For some reason I don't yet know, using LogicalMemorySpaces - // inside an OpenMPTarget build causes errors in the - // SharedAllocationRecords of other types. This is my way of erroring - // a build if we instantiate a LogicalMemSpace in an OMPTarget build - static_assert(!std::is_same::value, - "Can't use LogicalMemorySpaces in an OpenMPTarget build, we're " - "debugging memory issues"); -#endif - public: - //! 
Tag this class as a kokkos memory space - using memory_space = LogicalMemorySpace; - using size_type = typename BaseSpace::size_type; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - - using execution_space = - std::conditional_t::value, - typename BaseSpace::execution_space, - DefaultBaseExecutionSpace>; - - using device_type = Kokkos::Device; - - LogicalMemorySpace() = default; - - template - LogicalMemorySpace(Args&&... args) : underlying_space((Args &&) args...) {} - - /**\brief Allocate untracked memory in the space */ - void* allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); - } - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); - } - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); - } - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); - } - - /**\brief Return Name of the MemorySpace */ - constexpr static const char* name() { return Namer::get_name(); } - - private: - BaseSpace underlying_space; - template - friend class LogicalMemorySpace; - friend class Kokkos::Impl::SharedAllocationRecord; - - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - return underlying_space.impl_allocate(arg_label, arg_alloc_size, - arg_logical_size, arg_handle); - } - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle arg_handle = - Kokkos::Tools::make_space_handle(name())) const { - underlying_space.impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, - arg_logical_size, arg_handle); - } -}; -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - OtherSpace> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - OtherSpace, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = MemorySpaceAccess::assignable }; - enum { accessible = MemorySpaceAccess::accessible }; - enum { deepcopy = MemorySpaceAccess::deepcopy }; -}; - -template -struct MemorySpaceAccess< - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>, - 
Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>> { - enum { assignable = true }; - enum { accessible = true }; - enum { deepcopy = true }; -}; - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { -template -class SharedAllocationRecord, - void> : public SharedAllocationRecord { - private: - using SpaceType = - Kokkos::Experimental::LogicalMemorySpace; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - static void deallocate(RecordBase* arg_rec) { - delete static_cast(arg_rec); - } - -#ifdef KOKKOS_ENABLE_DEBUG - /**\brief Root record for tracked allocations from this - * LogicalMemorySpace instance */ - static RecordBase s_root_record; -#endif - - const SpaceType m_space; - - protected: - ~SharedAllocationRecord() { - m_space.deallocate(RecordBase::m_alloc_ptr->m_label, - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); - } - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, const SpaceType& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast*>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; - } - - public: - inline std::string get_label() const { - return std::string(RecordBase::head()->m_label); - } - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const SpaceType& arg_space, const std::string& arg_label, - const size_t arg_alloc_size) { - KOKKOS_IF_ON_HOST((return new SharedAllocationRecord(arg_space, arg_label, - arg_alloc_size);)) - KOKKOS_IF_ON_DEVICE(((void)arg_space; (void)arg_label; (void)arg_alloc_size; - return nullptr;)) - } - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const SpaceType& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return (void*)nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); - } - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - 
SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked: fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); - } - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } - } - - static SharedAllocationRecord* get_record(void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = SharedAllocationRecord; - - SharedAllocationHeader const* const head = - alloc_ptr ? Header::get_header(alloc_ptr) - : (SharedAllocationHeader*)nullptr; - RecordHost* const record = - head ? static_cast(head->m_record) : (RecordHost*)nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< LogicalMemorySpace<> , " - "void >::get_record ERROR")); - } - - return record; - } -#ifdef KOKKOS_ENABLE_DEBUG - static void print_records(std::ostream& s, const SpaceType&, - bool detail = false) { - SharedAllocationRecord::print_host_accessible_records( - s, "HostSpace", &s_root_record, detail); - } -#else - static void print_records(std::ostream&, const SpaceType&, - bool detail = false) { - (void)detail; - throw_runtime_exception( - "SharedAllocationRecord::print_records only works " - "with KOKKOS_ENABLE_DEBUG enabled"); - } -#endif -}; -#ifdef KOKKOS_ENABLE_DEBUG -/**\brief Root record for tracked allocations from this LogicalSpace - * instance */ -template -SharedAllocationRecord - SharedAllocationRecord, - void>::s_root_record; -#endif - -} // namespace Impl - -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -namespace Kokkos { - -namespace Impl { - -template -struct DeepCopy, - Kokkos::Experimental::LogicalMemorySpace< - BaseSpace, DefaultBaseExecutionSpace, Namer, SharesAccess>, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; - -template -struct DeepCopy, - DestinationSpace, ExecutionSpace> { - DeepCopy(void* dst, void* src, size_t n) { - DeepCopy(dst, src, n); - } - DeepCopy(const ExecutionSpace& exec, void* dst, void* src, size_t n) { - DeepCopy(exec, dst, src, n); - } -}; -} // namespace Impl - -} // namespace Kokkos -#endif // KOKKOS_LOGICALSPACES_HPP diff --git a/core/src/Kokkos_Macros.hpp b/core/src/Kokkos_Macros.hpp index 3cf7ac4fa24..b255d2a5195 100644 --- a/core/src/Kokkos_Macros.hpp +++ b/core/src/Kokkos_Macros.hpp @@ -84,11 +84,12 @@ //---------------------------------------------------------------------------- -#if !defined(KOKKOS_ENABLE_THREADS) && !defined(KOKKOS_ENABLE_CUDA) && \ - !defined(KOKKOS_ENABLE_OPENMP) && !defined(KOKKOS_ENABLE_HPX) && \ - !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_HIP) && \ - !defined(KOKKOS_ENABLE_SYCL) -#define 
KOKKOS_INTERNAL_NOT_PARALLEL +#if defined(KOKKOS_ENABLE_ATOMICS_BYPASS) && \ + (defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_HPX) || \ + defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENACC)) +#error Atomics may only be disabled if neither a host parallel nor a device backend is enabled #endif #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA @@ -339,12 +340,6 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif -// Temporary solution for SYCL not supporting printf in kernels. -// Might disappear at any point once we have found another solution. -#if !defined(KOKKOS_IMPL_DO_NOT_USE_PRINTF) -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(...) ::printf(__VA_ARGS__) -#endif - //---------------------------------------------------------------------------- // Define final version of functions. This is so that clang tidy can find these // macros more easily @@ -433,22 +428,6 @@ #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL #endif -//---------------------------------------------------------------------------- -// Determine for what space the code is being compiled: -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) - -#if defined(__CUDACC__) && defined(__CUDA_ARCH__) && defined(KOKKOS_ENABLE_CUDA) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA -#elif defined(__SYCL_DEVICE_ONLY__) && defined(KOKKOS_ENABLE_SYCL) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL -#elif defined(__HIPCC__) && defined(__HIP_DEVICE_COMPILE__) && \ - defined(KOKKOS_ENABLE_HIP) -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU -#else -#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST -#endif - -#endif //---------------------------------------------------------------------------- // Remove surrounding parentheses if present diff --git a/core/src/Kokkos_MasterLock.hpp b/core/src/Kokkos_MasterLock.hpp deleted file mode 100644 index 1d09617371a..00000000000 --- a/core/src/Kokkos_MasterLock.hpp +++ /dev/null @@ -1,56 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MASTER_LOCK_HPP -#define KOKKOS_MASTER_LOCK_HPP - -#include - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - -namespace Kokkos { -namespace Experimental { - -// my be used to coordinate work between master instances -// SHOULD NOT be used within a parallel algorithm -// -// This lock should be used with with a scoped lock guard -// i.e. 
std::unique_lock, std::lock_guard -// -// cannot be copied or moved -// has the following functions available -// -// Lock() -// ~Lock() -// -// void lock() -// void unlock() -// bool try_lock() -// -template -class MasterLock; - -} // namespace Experimental -} // namespace Kokkos - -#endif - -#endif // KOKKOS_MASTER_LOCK_HPP diff --git a/core/src/Kokkos_MathematicalConstants.hpp b/core/src/Kokkos_MathematicalConstants.hpp index 51a50d347de..1a77f373fd8 100644 --- a/core/src/Kokkos_MathematicalConstants.hpp +++ b/core/src/Kokkos_MathematicalConstants.hpp @@ -51,24 +51,6 @@ KOKKOS_IMPL_MATH_CONSTANT(phi, 1.618033988749894848204586834365638118L); } // namespace Kokkos::numbers -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Kokkos::Experimental { -using Kokkos::numbers::e_v; -using Kokkos::numbers::egamma_v; -using Kokkos::numbers::inv_pi_v; -using Kokkos::numbers::inv_sqrt3_v; -using Kokkos::numbers::inv_sqrtpi_v; -using Kokkos::numbers::ln10_v; -using Kokkos::numbers::ln2_v; -using Kokkos::numbers::log10e_v; -using Kokkos::numbers::log2e_v; -using Kokkos::numbers::phi_v; -using Kokkos::numbers::pi_v; -using Kokkos::numbers::sqrt2_v; -using Kokkos::numbers::sqrt3_v; -} // namespace Kokkos::Experimental -#endif - #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHCONSTANTS diff --git a/core/src/Kokkos_MathematicalFunctions.hpp b/core/src/Kokkos_MathematicalFunctions.hpp index ee64c67b93b..3fead8dd293 100644 --- a/core/src/Kokkos_MathematicalFunctions.hpp +++ b/core/src/Kokkos_MathematicalFunctions.hpp @@ -92,16 +92,6 @@ using promote_3_t = typename promote_3::type; #endif #endif -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_3) -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE -#else -#define KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - USING_DECLARATIONS_IN_EXPERIMENTAL_NAMESPACE) \ - /* nothing */ -#endif - #define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ @@ -128,13 +118,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } // isinf, isnan, and isinfinite do not work on Windows with CUDA with std:: // getting warnings about calling host function in device function then @@ -151,9 +135,7 @@ using promote_3_t = typename promote_3::type; KOKKOS_INLINE_FUNCTION std::enable_if_t, bool> FUNC( \ T x) { \ return ::FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #else #define KOKKOS_IMPL_MATH_UNARY_PREDICATE(FUNC) \ KOKKOS_INLINE_FUNCTION bool FUNC(float x) { \ @@ -173,9 +155,7 @@ using promote_3_t = typename promote_3::type; T x) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ return FUNC(static_cast(x)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { using ::Kokkos::FUNC; }) + } #endif #define KOKKOS_IMPL_MATH_BINARY_FUNCTION(FUNC) \ @@ -218,16 +198,10 @@ using promote_3_t = typename promote_3::type; long double> \ FUNC(T1 x, T2 
y) { \ using Promoted = Kokkos::Impl::promote_2_t; \ - static_assert(std::is_same_v, ""); \ + static_assert(std::is_same_v); \ using std::FUNC; \ return FUNC(static_cast(x), static_cast(y)); \ - } \ - KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( \ - namespace Experimental { \ - using ::Kokkos::FUNC; \ - using ::Kokkos::FUNC##f; \ - using ::Kokkos::FUNC##l; \ - }) + } #define KOKKOS_IMPL_MATH_TERNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x, float y, float z) { \ @@ -314,8 +288,6 @@ inline long double abs(long double x) { using std::abs; return abs(x); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { using ::Kokkos::abs; }) KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs) KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod) KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder) @@ -336,12 +308,6 @@ KOKKOS_INLINE_FUNCTION float nanf(char const*) { return sycl::nan(0u); } KOKKOS_INLINE_FUNCTION double nan(char const*) { return sycl::nan(0ul); } #endif inline long double nanl(char const* arg) { return ::nanl(arg); } -KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED( - namespace Experimental { - using ::Kokkos::nan; - using ::Kokkos::nanf; - using ::Kokkos::nanl; - }) // Exponential functions KOKKOS_IMPL_MATH_UNARY_FUNCTION(exp) // FIXME_NVHPC nvc++ has issues with exp2 @@ -478,7 +444,6 @@ KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit) // islessgreater // isunordered -#undef KOKKOS_IMPL_MATH_FUNCTIONS_DEFINED_IF_DEPRECATED_CODE_ENABLED #undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE diff --git a/core/src/Kokkos_MinMaxClamp.hpp b/core/src/Kokkos_MinMax.hpp similarity index 83% rename from core/src/Kokkos_MinMaxClamp.hpp rename to core/src/Kokkos_MinMax.hpp index 37a28a80b68..5c60a88bfb1 100644 --- a/core/src/Kokkos_MinMaxClamp.hpp +++ b/core/src/Kokkos_MinMax.hpp @@ -14,13 +14,8 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_MIN_MAX_CLAMP_HPP -#define KOKKOS_MIN_MAX_CLAMP_HPP +#ifndef KOKKOS_MIN_MAX_HPP +#define KOKKOS_MIN_MAX_HPP #include #include @@ -29,22 +24,6 @@ static_assert(false, namespace Kokkos { -// clamp -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi) { - KOKKOS_EXPECTS(!(hi < lo)); - return (value < lo) ? lo : (hi < value) ? hi : value; -} - -template -constexpr KOKKOS_INLINE_FUNCTION const T& clamp(const T& value, const T& lo, - const T& hi, - ComparatorType comp) { - KOKKOS_EXPECTS(!comp(hi, lo)); - return comp(value, lo) ? lo : comp(hi, value) ? 
hi : value; -} - // max template constexpr KOKKOS_INLINE_FUNCTION const T& max(const T& a, const T& b) { @@ -199,15 +178,6 @@ KOKKOS_INLINE_FUNCTION constexpr Kokkos::pair minmax( return result; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -namespace Experimental { -using ::Kokkos::clamp; -using ::Kokkos::max; -using ::Kokkos::min; -using ::Kokkos::minmax; -} // namespace Experimental -#endif - } // namespace Kokkos #endif diff --git a/core/src/Kokkos_Pair.hpp b/core/src/Kokkos_Pair.hpp index 7127c78280e..9be8d8d7aa1 100644 --- a/core/src/Kokkos_Pair.hpp +++ b/core/src/Kokkos_Pair.hpp @@ -28,6 +28,7 @@ #endif #include +#include #include namespace Kokkos { @@ -484,7 +485,6 @@ KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( } namespace Impl { - template struct is_pair_like : std::false_type {}; template diff --git a/core/src/Kokkos_Printf.hpp b/core/src/Kokkos_Printf.hpp index 39f95825c38..63a4cce2aeb 100644 --- a/core/src/Kokkos_Printf.hpp +++ b/core/src/Kokkos_Printf.hpp @@ -30,8 +30,11 @@ namespace Kokkos { // In contrast to std::printf, return void to get a consistent behavior across // backends. The GPU backends always return 1 and NVHPC only compiles if we // don't ask for the return value. +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) +using ::printf; +#else template -KOKKOS_FUNCTION void printf(const char* format, Args... args) { +KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) { #ifdef KOKKOS_ENABLE_SYCL // Some compilers warn if "args" is empty and format is not a string literal if constexpr (sizeof...(Args) == 0) @@ -39,15 +42,13 @@ KOKKOS_FUNCTION void printf(const char* format, Args... args) { else sycl::ext::oneapi::experimental::printf(format, args...); #else - if constexpr (sizeof...(Args) == 0) ::printf("%s", format); - // FIXME_OPENMPTARGET non-string-literal argument used in printf is not - // supported for spir64 -#if !(defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU)) + if constexpr (sizeof...(Args) == 0) + ::printf("%s", format); else ::printf(format, args...); #endif -#endif } +#endif } // namespace Kokkos diff --git a/core/src/Kokkos_Profiling_ProfileSection.hpp b/core/src/Kokkos_Profiling_ProfileSection.hpp index 29a04ac3b07..e7a9ba0c7ed 100644 --- a/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -22,49 +22,34 @@ #endif #include -#include #include #include -namespace Kokkos { -namespace Profiling { +namespace Kokkos::Profiling { + +class [[nodiscard]] ProfilingSection { + uint32_t sectionID; -class ProfilingSection { public: ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; - ProfilingSection(const std::string& sectionName) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::createProfileSection(sectionName, &secID); - } - } - - void start() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::startSection(secID); - } +#if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 + [[nodiscard]] +#endif + explicit ProfilingSection(const std::string& sectionName) { + Kokkos::Profiling::createProfileSection(sectionName, §ionID); } - void stop() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::stopSection(secID); - } - } + void start() { Kokkos::Profiling::startSection(sectionID); } - ~ProfilingSection() { - if (Kokkos::Profiling::profileLibraryLoaded()) { - 
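      // (Note the contrast with the rewrite below: the pre-change class
      // guarded every tooling call behind profileLibraryLoaded(), while the
      // new one calls the Kokkos::Profiling hooks unconditionally and keeps
      // only the sectionID member.)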
Kokkos::Profiling::destroyProfileSection(secID); - } - } + void stop() { Kokkos::Profiling::stopSection(sectionID); } - protected: - uint32_t secID; + ~ProfilingSection() { Kokkos::Profiling::destroyProfileSection(sectionID); } }; -} // namespace Profiling -} // namespace Kokkos +} // namespace Kokkos::Profiling #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_CORE #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/core/src/Kokkos_Swap.hpp b/core/src/Kokkos_Swap.hpp new file mode 100644 index 00000000000..2f849a13ab6 --- /dev/null +++ b/core/src/Kokkos_Swap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_SWAP_HPP +#define KOKKOS_SWAP_HPP + +#include + +#include +#include +#include + +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr std::enable_if_t && + std::is_move_assignable_v> +kokkos_swap(T& a, T& b) noexcept(std::is_nothrow_move_constructible_v&& + std::is_nothrow_move_assignable_v) { + T t(std::move(a)); + a = std::move(b); + b = std::move(t); +} + +namespace Impl { + +template +struct is_swappable { + template + static decltype(kokkos_swap(std::declval(), std::declval())) + test_swap(int); + struct Nope; + template + static Nope test_swap(long); + static constexpr bool value = + !std::is_same_v(0)), Nope>; +}; + +template +inline constexpr bool is_nothrow_swappable_v = + noexcept(kokkos_swap(std::declval(), std::declval())); + +} // namespace Impl + +template +KOKKOS_FUNCTION constexpr std::enable_if_t::value> +kokkos_swap(T (&a)[N], T (&b)[N]) noexcept(Impl::is_nothrow_swappable_v) { + for (std::size_t i = 0; i < N; ++i) { + kokkos_swap(a[i], b[i]); + } +} + +} // namespace Kokkos + +#endif diff --git a/core/src/Kokkos_Tuners.hpp b/core/src/Kokkos_Tuners.hpp index 618401654e7..f5ffc66af5b 100644 --- a/core/src/Kokkos_Tuners.hpp +++ b/core/src/Kokkos_Tuners.hpp @@ -256,13 +256,14 @@ auto get_point_helper(const PointType& in, const ArrayType& indices, template struct GetPoint; -template -struct GetPoint> { +template +struct GetPoint< + PointType, + std::array> { using index_set_type = - std::array; + std::array; static auto build(const PointType& in, const index_set_type& indices) { - return get_point_helper(in, indices, std::make_index_sequence{}); + return get_point_helper(in, indices, std::make_index_sequence{}); } }; diff --git a/core/src/Kokkos_View.hpp b/core/src/Kokkos_View.hpp index bcbb28014cd..484a0e6f62e 100644 --- a/core/src/Kokkos_View.hpp +++ b/core/src/Kokkos_View.hpp @@ -39,7 +39,7 @@ static_assert(false, #ifdef KOKKOS_ENABLE_IMPL_MDSPAN #include #endif -#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -75,25 +75,59 @@ constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); } -KOKKOS_INLINE_FUNCTION -void runtime_check_rank(const size_t rank, const size_t dyn_rank, - const bool is_void_spec, const size_t i0, - const size_t i1, const size_t i2, 
const size_t i3, - const size_t i4, const size_t i5, const size_t i6, - const size_t i7, const std::string& label) { +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. +template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { (void)(label); if (is_void_spec) { const size_t num_passed_args = count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } - if (num_passed_args != dyn_rank && num_passed_args != rank) { + if (!n_args_is_dyn_rank && !n_args_is_rank) { KOKKOS_IF_ON_HOST( const std::string message = - "Constructor for Kokkos View '" + label + - "' has mismatched number of arguments. Number of arguments = " + + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. The number " + "of arguments = " + std::to_string(num_passed_args) + - " but dynamic rank = " + std::to_string(dyn_rank) + " \n"; + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; Kokkos::abort(message.c_str());) KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " "mismatched number of arguments.");) @@ -814,15 +848,15 @@ class View : public ViewTraits { template static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { - static_assert(rank <= sizeof...(Is), ""); - static_assert(sizeof...(Is) <= 8, ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); } template static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) 
{ - static_assert(rank == sizeof...(Is), ""); - static_assert(Kokkos::Impl::are_integral::value, ""); + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); } public: @@ -1402,21 +1436,30 @@ class View : public ViewTraits { "execution space"); } - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - rank, rank_dynamic, - std::is_same::value, i0, i1, i2, i3, - i4, i5, i6, i7, alloc_name); +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); @@ -1445,6 +1488,29 @@ class View : public ViewTraits { typename Impl::ViewCtorProp::pointer_type>::value, "Constructing View to wrap user memory must supply matching pointer " "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v || + is_layouttiled::value) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif } // Simple dimension-only layout diff --git a/core/src/OpenACC/Kokkos_OpenACC.cpp b/core/src/OpenACC/Kokkos_OpenACC.cpp index f54c44d66f0..99daf379b6f 100644 --- a/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -58,8 +58,10 @@ void Kokkos::Experimental::OpenACC::impl_initialize( Impl::OpenACCInternal::m_acc_device_num = acc_get_device_num(acc_device_host); } else { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - int const dev_num = get_gpu(settings); + int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; } diff --git a/core/src/OpenACC/Kokkos_OpenACC.hpp b/core/src/OpenACC/Kokkos_OpenACC.hpp index b012f6a42a4..5155bee33dc 100644 --- a/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -91,7 +91,11 @@ class OpenACC { #else int concurrency() const { return 256000; } // FIXME_OPENACC #endif - static bool in_parallel() { return acc_on_device(acc_device_not_host); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static 
bool in_parallel() { + return acc_on_device(acc_device_not_host); + } +#endif uint32_t impl_instance_id() const noexcept; Impl::OpenACCInternal* impl_internal_space_instance() const { return m_space_instance.get(); diff --git a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp index 141ec77fd1f..acc0dcd3c6e 100644 --- a/core/src/OpenACC/Kokkos_OpenACCSpace.cpp +++ b/core/src/OpenACC/Kokkos_OpenACCSpace.cpp @@ -19,8 +19,8 @@ #include #include #include -#include #include +#include #include @@ -66,6 +66,19 @@ void *Kokkos::Experimental::OpenACCSpace::impl_allocate( ptr = acc_malloc(arg_alloc_size); + if (!ptr) { + size_t alignment = 1; // OpenACC does not handle alignment + using Kokkos::Experimental::RawMemoryAllocationFailure; + auto failure_mode = + arg_alloc_size > 0 + ? RawMemoryAllocationFailure::FailureMode::OutOfMemoryError + : RawMemoryAllocationFailure::FailureMode::InvalidAllocationSize; + auto alloc_mechanism = + RawMemoryAllocationFailure::AllocationMechanism::OpenACCMalloc; + throw RawMemoryAllocationFailure(arg_alloc_size, alignment, failure_mode, + alloc_mechanism); + } + if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; diff --git a/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp b/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp index 4aed7e00f76..ca022192b0b 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_DeepCopy.hpp @@ -34,7 +34,7 @@ struct Kokkos::Impl::DeepCopy 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -52,7 +52,7 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -60,7 +60,7 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_device(dst, const_cast(src), n); + acc_memcpy_device_async(dst, const_cast(src), n, acc_async_noval); } } }; @@ -70,7 +70,9 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_to_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, const void* src, size_t n) { @@ -85,7 +87,8 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { @@ -93,7 +96,8 @@ struct Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_to_device(dst, const_cast(src), n); + acc_memcpy_to_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; @@ -104,7 +108,8 @@ struct Kokkos::Impl::DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } DeepCopy(const Kokkos::Experimental::OpenACC& exec, void* dst, @@ -120,14 +125,17 @@ template struct Kokkos::Impl::DeepCopy< Kokkos::HostSpace, 
Kokkos::Experimental::OpenACCSpace, ExecutionSpace> { DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) acc_memcpy_from_device(dst, const_cast(src), n); + if (n > 0) + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { exec.fence( "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); if (n > 0) { - acc_memcpy_from_device(dst, const_cast(src), n); + acc_memcpy_from_device_async(dst, const_cast(src), n, + acc_async_noval); } } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index 6645616ba51..c3d72368727 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -35,7 +35,7 @@ class OpenACCInternal { public: static int m_acc_device_num; - int m_async_arg = acc_async_sync; + int m_async_arg = acc_async_noval; OpenACCInternal() = default; @@ -43,7 +43,7 @@ class OpenACCInternal { bool verify_is_initialized(const char* const label) const; - void initialize(int async_arg = acc_async_sync); + void initialize(int async_arg = acc_async_noval); void finalize(); bool is_initialized() const; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 2c7793dc116..5afb5e75d39 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -31,7 +31,7 @@ template ::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -136,6 +136,7 @@ class Kokkos::Impl::ParallelReduce> struct OpenACCParallelReduceHelper { OpenACCParallelReduceHelper(Functor const&, Reducer const&, Policy const&) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -140,6 +140,7 @@ class Kokkos::Impl::ParallelReduce::value, + static_assert(Kokkos::Impl::always_false::value, "not implemented"); } }; @@ -129,7 +129,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "custom reduction is not implemented"); } @@ -140,7 +140,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const JoinType& join, ValueType& init_result) { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "custom reduction is not implemented"); } @@ -394,6 +394,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( vector_length); \ functor(team, val); \ } \ + acc_wait(async_arg); \ aval = val; \ } \ } // namespace Kokkos::Experimental::Impl diff --git a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp index 91faa64f733..76e1514476a 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp +++ b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.cpp @@ -16,92 +16,11 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE -#include +#include #include -#include -#include - -#ifdef KOKKOS_ENABLE_DEBUG -Kokkos::Impl::SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::OpenACCSpace, void>::s_root_record; -#endif - 
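// [Editorial aside, not part of the patch] The DeepCopy specializations
// above now enqueue acc_memcpy_*_async on the acc_async_noval queue rather
// than blocking, matching the new default of OpenACCInternal::m_async_arg.
// A minimal sketch of the two public idioms this affects; the function and
// view names are hypothetical:

#include <Kokkos_Core.hpp>

void copy_back_example(int n) {
  Kokkos::View<double*, Kokkos::Experimental::OpenACCSpace> dev("dev", n);
  auto host = Kokkos::create_mirror_view(dev);

  // Blocking form: fences internally, so the data is ready on return.
  Kokkos::deep_copy(host, dev);

  // Instance form: the copy is ordered on acc's queue and may still be in
  // flight until that instance is fenced.
  Kokkos::Experimental::OpenACC acc;
  Kokkos::deep_copy(acc, host, dev);
  acc.fence("wait for asynchronous copy-back");
}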
-Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
-                                     void>::~SharedAllocationRecord() {
-  m_space.deallocate(m_label.c_str(),
-                     SharedAllocationRecord<void, void>::m_alloc_ptr,
-                     (SharedAllocationRecord<void, void>::m_alloc_size -
-                      sizeof(SharedAllocationHeader)));
-}
-
-Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, void>::
-    SharedAllocationRecord(
-        const Kokkos::Experimental::OpenACCSpace &arg_space,
-        const std::string &arg_label, const size_t arg_alloc_size,
-        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
-    // Pass through allocated [ SharedAllocationHeader , user_memory ]
-    // Pass through deallocation function
-    : base_t(
-#ifdef KOKKOS_ENABLE_DEBUG
-          &SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
-                                  void>::s_root_record,
-#endif
-          Impl::checked_allocation_with_header(arg_space, arg_label,
-                                               arg_alloc_size),
-          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
-          arg_label),
-      m_space(arg_space) {
-  SharedAllocationHeader header;
-
-  this->base_t::_fill_host_accessible_header_info(header, arg_label);
-
-  Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, HostSpace>(
-      RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader));
-  Kokkos::fence(
-      "SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, "
-      "void>::SharedAllocationRecord(): fence after copying header from "
-      "HostSpace");
-}
-
-Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace, void>::
-    SharedAllocationRecord(
-        const Kokkos::Experimental::OpenACC &arg_exec_space,
-        const Kokkos::Experimental::OpenACCSpace &arg_space,
-        const std::string &arg_label, const size_t arg_alloc_size,
-        const SharedAllocationRecord<void, void>::function_type arg_dealloc)
-    // Pass through allocated [ SharedAllocationHeader , user_memory ]
-    // Pass through deallocation function
-    : base_t(
-#ifdef KOKKOS_ENABLE_DEBUG
-          &SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
-                                  void>::s_root_record,
-#endif
-          Impl::checked_allocation_with_header(arg_exec_space, arg_space,
-                                               arg_label, arg_alloc_size),
-          sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
-          arg_label),
-      m_space(arg_space) {
-  SharedAllocationHeader header;
-
-  this->base_t::_fill_host_accessible_header_info(header, arg_label);
-
-  Kokkos::Impl::DeepCopy<Kokkos::Experimental::OpenACCSpace, HostSpace>(
-      arg_exec_space, RecordBase::m_alloc_ptr, &header,
-      sizeof(SharedAllocationHeader));
-}
-
-//==============================================================================
-// <editor-fold desc="Explicit instantiations of CRTP Base classes"> {{{1
+#include
 
 #include
 
-// To avoid additional compilation cost for something that's (mostly?) not
-// performance sensitive, we explicitly instantiate these CRTP base classes
-// here, where we have access to the associated *_timpl.hpp header files.
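// [Editorial aside] The explicit-instantiation macro invoked below
// (KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION)
// replaces exactly the pair of instantiations being deleted. A sketch of
// the pattern (illustrative only; the actual Kokkos definition may differ):

#define EXAMPLE_EXPLICIT_INSTANTIATION(SPACE)              \
  template class Kokkos::Impl::                            \
      HostInaccessibleSharedAllocationRecordCommon<SPACE>; \
  template class Kokkos::Impl::SharedAllocationRecordCommon<SPACE>

// Example usage, mirroring the call sites in this patch:
// EXAMPLE_EXPLICIT_INSTANTIATION(Kokkos::Experimental::OpenACCSpace);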
-template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon<
-    Kokkos::Experimental::OpenACCSpace>;
-template class Kokkos::Impl::SharedAllocationRecordCommon<
-    Kokkos::Experimental::OpenACCSpace>;
-
-// end Explicit instantiations of CRTP Base classes }}}1
-//==============================================================================
+KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
+    Kokkos::Experimental::OpenACCSpace);
diff --git a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp
index cf83a5b27bc..cde5ecdcb77 100644
--- a/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp
+++ b/core/src/OpenACC/Kokkos_OpenACC_SharedAllocationRecord.hpp
@@ -20,55 +20,7 @@
 #include
 #include
-#include
-
-template <>
-class Kokkos::Impl::SharedAllocationRecord<Kokkos::Experimental::OpenACCSpace,
-                                           void>
-    : public HostInaccessibleSharedAllocationRecordCommon<
-          Kokkos::Experimental::OpenACCSpace> {
- private:
-  friend class HostInaccessibleSharedAllocationRecordCommon<
-      Kokkos::Experimental::OpenACCSpace>;
-  friend class SharedAllocationRecordCommon<
-      Kokkos::Experimental::OpenACCSpace>;
-  friend Kokkos::Experimental::OpenACCSpace;
-
-  using base_t = HostInaccessibleSharedAllocationRecordCommon<
-      Kokkos::Experimental::OpenACCSpace>;
-  using RecordBase = SharedAllocationRecord<void, void>;
-
-  SharedAllocationRecord(const SharedAllocationRecord&) = delete;
-  SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
-
-  /**\brief Root record for tracked allocations from this OpenACCSpace
-   * instance */
-  static RecordBase s_root_record;
-
-  const Kokkos::Experimental::OpenACCSpace m_space;
-
- protected:
-  ~SharedAllocationRecord();
-  SharedAllocationRecord() = default;
-
-  template <class ExecutionSpace>
-  SharedAllocationRecord(
-      const ExecutionSpace& /*exec_space*/,
-      const Kokkos::Experimental::OpenACCSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate)
-      : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size,
-                               arg_dealloc) {}
-
-  SharedAllocationRecord(
-      const Kokkos::Experimental::OpenACC& exec_space,
-      const Kokkos::Experimental::OpenACCSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
-
-  SharedAllocationRecord(
-      const Kokkos::Experimental::OpenACCSpace& arg_space,
-      const std::string& arg_label, const size_t arg_alloc_size,
-      const RecordBase::function_type arg_dealloc = &deallocate);
-};
+KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
+    Kokkos::Experimental::OpenACCSpace);
 
 #endif
diff --git a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp
index 4ec71f56ef6..20ea392452b 100644
--- a/core/src/OpenACC/Kokkos_OpenACC_Team.hpp
+++ b/core/src/OpenACC/Kokkos_OpenACC_Team.hpp
@@ -82,7 +82,7 @@ class OpenACCTeamMember {
   // FIXME_OPENACC: team_broadcast() is not implemented.
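// [Editorial aside] The static_assert changes below swap
// !Kokkos::Impl::always_true<...> for Kokkos::Impl::always_false<...>.
// Either way, the point is a *dependent* false: under C++17 a plain
// static_assert(false, ...) fires even when the template is never
// instantiated. A self-contained sketch of the idiom with a mock trait:

#include <type_traits>

template <class...>
struct mock_always_false : std::false_type {};

template <class T>
void not_implemented(T) {
  // Fails to compile only if this template is actually instantiated.
  static_assert(mock_always_false<T>::value, "not implemented");
}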
template KOKKOS_FUNCTION void team_broadcast(ValueType& value, int thread_id) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_broadcast() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -99,7 +99,7 @@ class OpenACCTeamMember { template KOKKOS_FUNCTION ValueType team_reduce(const ValueType& value, const JoinOp& op_in) const { - static_assert(!Kokkos::Impl::always_true::value, + static_assert(Kokkos::Impl::always_false::value, "Kokkos Error: team_reduce() is not implemented for the " "OpenACC backend"); return ValueType(); @@ -110,7 +110,7 @@ class OpenACCTeamMember { KOKKOS_FUNCTION ArgType team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { static_assert( - !Kokkos::Impl::always_true::value, + Kokkos::Impl::always_false::value, "Kokkos Error: team_scan() is not implemented for the OpenACC backend"); return ArgType(); } diff --git a/core/src/OpenMP/Kokkos_OpenMP.cpp b/core/src/OpenMP/Kokkos_OpenMP.cpp index 9a169a435c7..81f2c5c3056 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -81,29 +81,16 @@ bool OpenMP::impl_is_initialized() noexcept { return Impl::OpenMPInternal::singleton().is_initialized(); } -bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return ( - (exec_space.impl_internal_space_instance()->m_level < omp_get_level()) && - (!Impl::t_openmp_instance || - Impl::t_openmp_instance->m_level < omp_get_level())); -#else +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { return exec_space.impl_internal_space_instance()->m_level < omp_get_level(); -#endif } +#endif int OpenMP::impl_thread_pool_size() const noexcept { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - return OpenMP::in_parallel(*this) - ? omp_get_num_threads() - : (Impl::t_openmp_instance - ? Impl::t_openmp_instance->m_pool_size - : impl_internal_space_instance()->m_pool_size); -#else - return OpenMP::in_parallel(*this) + return (impl_internal_space_instance()->get_level() < omp_get_level()) ? omp_get_num_threads() : impl_internal_space_instance()->m_pool_size; -#endif } int OpenMP::impl_max_hardware_threads() noexcept { diff --git a/core/src/OpenMP/Kokkos_OpenMP.hpp b/core/src/OpenMP/Kokkos_OpenMP.hpp index 594f40d5245..11292af84ad 100644 --- a/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -27,14 +27,7 @@ static_assert(false, #include -#include -#include #include - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - #include #include #include @@ -45,6 +38,8 @@ static_assert(false, #include +#include +#include #include /*--------------------------------------------------------------------------*/ @@ -53,11 +48,6 @@ namespace Kokkos { namespace Impl { class OpenMPInternal; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -// FIXME_OPENMP we can remove this after we remove partition_master -inline thread_local OpenMPInternal* t_openmp_instance = nullptr; -#endif } // namespace Impl /// \class OpenMP @@ -67,12 +57,7 @@ class OpenMP { //! Tag this class as a kokkos execution space using execution_space = OpenMP; - using memory_space = -#ifdef KOKKOS_ENABLE_HBWSPACE - Experimental::HBWSpace; -#else - HostSpace; -#endif + using memory_space = HostSpace; //! 
This execution space preferred device_type using device_type = Kokkos::Device; @@ -87,8 +72,10 @@ class OpenMP { /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief is the instance running a parallel algorithm - static bool in_parallel(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED static bool in_parallel(OpenMP const& = OpenMP()) noexcept; +#endif /// \brief Wait until all dispatched functors complete on the given instance /// @@ -104,18 +91,6 @@ class OpenMP { /// This always returns false on OpenMP inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - /// \brief Partition the default instance and call 'f' on each new 'master' - /// thread - /// - /// Func is a functor with the following signiture - /// void( int partition_id, int num_partitions ) - template - KOKKOS_DEPRECATED static void partition_master( - F const& f, int requested_num_partitions = 0, - int requested_partition_size = 0); -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); #else @@ -166,14 +141,7 @@ class OpenMP { }; inline int OpenMP::impl_thread_pool_rank() noexcept { - // FIXME_OPENMP Can we remove this when removing partition_master? It's only - // used in one partition_master test -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - KOKKOS_IF_ON_HOST( - (return Impl::t_openmp_instance ? 0 : omp_get_thread_num();)) -#else KOKKOS_IF_ON_HOST((return omp_get_thread_num();)) -#endif KOKKOS_IF_ON_DEVICE((return -1;)) } diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 12bf3b71f7c..32172fbc6c7 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -47,61 +47,6 @@ void OpenMPInternal::release_lock() { desul::MemoryScopeDevice()); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void OpenMPInternal::validate_partition_impl(const int nthreads, - int &num_partitions, - int &partition_size) { - if (nthreads == 1) { - num_partitions = 1; - partition_size = 1; - } else if (num_partitions < 1 && partition_size < 1) { - int idle = nthreads; - for (int np = 2; np <= nthreads; ++np) { - for (int ps = 1; ps <= nthreads / np; ++ps) { - if (nthreads - np * ps < idle) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } else if (num_partitions < 1 && partition_size > 0) { - if (partition_size <= nthreads) { - num_partitions = nthreads / partition_size; - } else { - num_partitions = 1; - partition_size = nthreads; - } - } else if (num_partitions > 0 && partition_size < 1) { - if (num_partitions <= nthreads) { - partition_size = nthreads / num_partitions; - } else { - num_partitions = nthreads; - partition_size = 1; - } - } else if (num_partitions * partition_size > nthreads) { - int idle = nthreads; - const int NP = num_partitions; - const int PS = partition_size; - for (int np = NP; np > 0; --np) { - for (int ps = PS; ps > 0; --ps) { - if ((np * ps <= nthreads) && (nthreads - np * ps < idle)) { - idle = nthreads - np * ps; - num_partitions = np; - partition_size = ps; - } - if (idle == 0) { - break; - } - } - } - } -} -#endif - void OpenMPInternal::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * diff --git a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 
03f5fff395a..35b9aa93ba7 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -41,16 +41,6 @@ #include /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { - -inline bool execute_in_serial(OpenMP const& space = OpenMP()) { - return (OpenMP::in_parallel(space) && - !(omp_get_nested() && (omp_get_level() == 1))); -} - -} // namespace Impl -} // namespace Kokkos namespace Kokkos { namespace Impl { @@ -99,11 +89,6 @@ class OpenMPInternal { // Release lock used to protect access to m_pool void release_lock(); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - static void validate_partition_impl(const int nthreads, int& num_partitions, - int& partition_size); -#endif - void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, size_t team_shared_bytes, size_t thread_local_bytes); @@ -115,6 +100,8 @@ class OpenMPInternal { return m_pool[i]; } + int get_level() const { return m_level; } + bool is_initialized() const { return m_initialized; } bool verify_is_initialized(const char* const label) const; @@ -122,32 +109,20 @@ class OpenMPInternal { void print_configuration(std::ostream& s) const; }; -} // namespace Impl - -namespace Experimental { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template <> -class MasterLock { - public: - void lock() { omp_set_lock(&m_lock); } - void unlock() { omp_unset_lock(&m_lock); } - bool try_lock() { return static_cast(omp_test_lock(&m_lock)); } - - KOKKOS_DEPRECATED MasterLock() { omp_init_lock(&m_lock); } - ~MasterLock() { omp_destroy_lock(&m_lock); } - - MasterLock(MasterLock const&) = delete; - MasterLock(MasterLock&&) = delete; - MasterLock& operator=(MasterLock const&) = delete; - MasterLock& operator=(MasterLock&&) = delete; - - private: - omp_lock_t m_lock; -}; +inline bool execute_in_serial(OpenMP const& space = OpenMP()) { +// The default value returned by `omp_get_max_active_levels` with gcc version +// lower than 11.1.0 is 2147483647 instead of 1. 
+#if (!defined(KOKKOS_COMPILER_GNU) || KOKKOS_COMPILER_GNU >= 1110) && \ + _OPENMP >= 201511 + bool is_nested = omp_get_max_active_levels() > 1; +#else + bool is_nested = static_cast(omp_get_nested()); #endif + return (space.impl_internal_space_instance()->get_level() < omp_get_level() && + !(is_nested && (omp_get_level() == 1))); +} -} // namespace Experimental +} // namespace Impl namespace Experimental { namespace Impl { @@ -202,50 +177,6 @@ std::vector partition_space(OpenMP const& main_instance, return Impl::create_OpenMP_instances(main_instance, weights); } } // namespace Experimental - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -template -KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, - int partition_size) { -#if _OPENMP >= 201511 - if (omp_get_max_active_levels() > 1) { -#else - if (omp_get_nested()) { -#endif - using Exec = Impl::OpenMPInternal; - - Exec* prev_instance = &Impl::OpenMPInternal::singleton(); - - Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, - partition_size); - - OpenMP::memory_space space; - -#pragma omp parallel num_threads(num_partitions) - { - Exec thread_local_instance(partition_size); - Impl::t_openmp_instance = &thread_local_instance; - - size_t pool_reduce_bytes = 32 * partition_size; - size_t team_reduce_bytes = 32 * partition_size; - size_t team_shared_bytes = 1024 * partition_size; - size_t thread_local_bytes = 1024; - - thread_local_instance.resize_thread_data( - pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, - thread_local_bytes); - - omp_set_num_threads(partition_size); - f(omp_get_thread_num(), omp_get_num_threads()); - Impl::t_openmp_instance = nullptr; - } - } else { - // nested openmp not enabled - f(0, 1); - } -} -#endif - } // namespace Kokkos #endif diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp b/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp index 96dc664eb79..823a7e668e5 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp @@ -147,15 +147,7 @@ class ParallelFor, Kokkos::OpenMP> { inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -251,16 +243,9 @@ class ParallelFor, inline ParallelFor(const FunctorType& arg_functor, MDRangePolicy arg_policy) : m_instance(nullptr), m_iter(arg_policy, arg_functor) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } + template static int max_tile_size_product(const Policy&, const Functor&) { /** @@ -409,15 +394,7 @@ class ParallelFor, m_shmem_size(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize::value( arg_functor, arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; diff --git a/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp 
b/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp index 52cdef18e65..05fd1c9dce3 100644 --- a/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp +++ b/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp @@ -170,15 +170,7 @@ class ParallelReduce, m_functor_reducer(arg_functor_reducer), m_policy(arg_policy), m_result_ptr(arg_view.data()) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, @@ -319,15 +311,7 @@ class ParallelReduce::accessible, @@ -543,15 +527,7 @@ class ParallelReduce::value( arg_functor_reducer.get_functor(), arg_policy.team_size())) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif static_assert( Kokkos::Impl::MemorySpaceAccess, inline ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy) : m_instance(nullptr), m_functor(arg_functor), m_policy(arg_policy) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } }; @@ -292,15 +284,7 @@ class ParallelScanWithTotal, Kokkos::Impl::MemorySpaceAccess::accessible, "Kokkos::OpenMP parallel_scan result must be host-accessible!"); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - if (t_openmp_instance) { - m_instance = t_openmp_instance; - } else { - m_instance = arg_policy.space().impl_internal_space_instance(); - } -#else m_instance = arg_policy.space().impl_internal_space_instance(); -#endif } //---------------------------------------- diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp index adf972dd081..ea4e7f6baba 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp @@ -65,7 +65,11 @@ class OpenMPTarget { using scratch_memory_space = ScratchMemorySpace; - inline static bool in_parallel() { return omp_in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static bool in_parallel() { + return omp_in_parallel(); + } +#endif static void fence(const std::string& name = "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence"); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index 81fbc56de00..a414b34d7c6 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -37,7 +37,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -110,79 +109,13 @@ void OpenMPTargetSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - 
Kokkos::Experimental::OpenMPTargetSpace, void>::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, (alloc_size - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, arg_label); - - // TODO DeepCopy - // DeepCopy - Kokkos::Impl::DeepCopy( - RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - Kokkos::fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. 
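// [Editorial aside] The constructor deleted above shows the pattern the
// shared macros centralize for host-inaccessible spaces: the header cannot
// be written in place on the device, so it is staged in a host-side
// SharedAllocationHeader and deep-copied to the front of the allocation.
// Schematic of the resulting layout (names are illustrative, not Kokkos's):

#include <cstddef>

struct ExampleHeader {
  void* record;    // back-pointer to the owning allocation record
  char label[64];  // filled on the host, then copied into device memory
};

// User data begins just past the header, which is how get_record() can
// recover the record from a raw pointer:
inline void* example_payload(ExampleHeader* h) { return h + 1; }
inline ExampleHeader* example_header(void* payload) {
  return static_cast<ExampleHeader*>(payload) - 1;
}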
-template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - -} // end namespace Impl -} // end namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::OpenMPTargetSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index e5b33d0982f..ed625cfcc82 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -98,6 +98,16 @@ class OpenMPTargetSpace { ~OpenMPTargetSpace() = default; /**\brief Allocate untracked memory in the space */ + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + // FIXME_OPENMPTARGET Use execution space instance + void* allocate(const OpenMPTarget&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; @@ -121,9 +131,6 @@ class OpenMPTargetSpace { const size_t arg_logical_size = 0, const Kokkos::Tools::SpaceHandle = Kokkos::Tools::make_space_handle(name())) const; - - friend class Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::OpenMPTargetSpace, void>; }; } // namespace Experimental } // namespace Kokkos @@ -131,64 +138,8 @@ class OpenMPTargetSpace { //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -template <> -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace> { - private: - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - friend Kokkos::Experimental::OpenMPTargetSpace; - - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::OpenMPTargetSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - - /**\brief Root record for tracked allocations from this OpenMPTargetSpace - * instance */ - static RecordBase s_root_record; - - const Kokkos::Experimental::OpenMPTargetSpace m_space; - - protected: - ~SharedAllocationRecord(); - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const 
RecordBase::function_type arg_dealloc = &deallocate); - - public: - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( - const Kokkos::Experimental::OpenMPTargetSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc) { - KOKKOS_IF_ON_HOST( - (return new SharedAllocationRecord(arg_space, arg_label, arg_alloc);)) - KOKKOS_IF_ON_DEVICE( - ((void)arg_space; (void)arg_label; (void)arg_alloc; return nullptr;)) - } -}; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::OpenMPTargetSpace); //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp index 1902c38409a..b39f5aca353 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp @@ -75,6 +75,7 @@ int* OpenMPTargetExec::m_lock_array = nullptr; uint64_t OpenMPTargetExec::m_lock_size = 0; uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; +std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; void OpenMPTargetExec::clear_scratch() { Kokkos::Experimental::OpenMPTargetSpace space; @@ -98,6 +99,11 @@ void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, int64_t shmem_size_L1, int64_t league_size) { Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif const int64_t shmem_size = shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. const int64_t padding = shmem_size * 10 / 100; // Padding per team. diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 9e8844a6f20..3387108da39 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -178,8 +178,10 @@ void OpenMPTarget::impl_static_fence(const std::string& name) { } void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { + using Kokkos::Impl::get_visible_devices; + std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; - const int device_num = get_gpu(settings); + const int device_num = get_gpu(settings).value_or(visible_devices[0]); omp_set_default_device(device_num); Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp new file mode 100644 index 00000000000..2bd672f4d06 --- /dev/null +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp @@ -0,0 +1,46 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_OPENMPTARGET_MACROS_HPP
+#define KOKKOS_OPENMPTARGET_MACROS_HPP
+
+// Intel architectures prefer the classical hierarchical parallelism that relies
+// on OpenMP.
+#if defined(KOKKOS_ARCH_INTEL_GPU)
+#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
+#endif
+
+// Define a macro for the LLVM compiler at version 17 or newer on NVIDIA and
+// AMD GPUs. This is useful in cases where LLVM extensions beyond the OpenMP
+// standard can be used.
+#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1700) && \
+    (defined(KOKKOS_ARCH_AMD_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU))
+#define KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS
+#endif
+
+#define KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(x) _Pragma(#x)
+#define KOKKOS_IMPL_OMPTARGET_PRAGMA(x) \
+  KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(omp target x)
+
+// Use scratch memory extensions to request dynamic shared memory for the
+// right compiler/architecture combination.
+#ifdef KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS
+#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) ompx_dyn_cgroup_mem(N)
+#else
+#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N)
+#endif
+
+#endif  // KOKKOS_OPENMPTARGET_MACROS_HPP
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
index 9767d8e53ef..dcc509d2faf 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp
@@ -21,16 +21,10 @@
 #include
 #include
 #include
-#include
 #include
 #include "Kokkos_OpenMPTarget_Abort.hpp"
-
-// Intel architectures prefer the classical hierarchical parallelism that relies
-// on OpenMP.
-#if defined(KOKKOS_ARCH_INTEL_GPU)
-#define KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU
-#endif
+#include

 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -113,14 +107,20 @@ class OpenMPTargetExecTeamMember {
     team_broadcast(value, thread_id);
   }

-  // FIXME_OPENMPTARGET this function has the wrong interface and currently
-  // ignores the reducer passed.
-  template <typename ValueType, typename JoinOp>
-  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType& value,
-                                               const JoinOp&) const {
+  template <typename ReducerType>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const& reducer) const noexcept {
+    team_reduce(reducer, reducer.reference());
+  }
+
+  // FIXME_OPENMPTARGET this function currently ignores the reducer passed.
+  template <typename ReducerType>
+  KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
+  team_reduce(ReducerType const&, typename ReducerType::value_type& value) const
+      noexcept {
 #pragma omp barrier
-    using value_type = ValueType;
+    using value_type = typename ReducerType::value_type;
     // const JoinLambdaAdapter op(op_in);

     // Make sure there is enough scratch space:
@@ -149,8 +149,9 @@ class OpenMPTargetExecTeamMember {
       }
 #pragma omp barrier
     }
-    return team_scratch[0];
+    value = team_scratch[0];
   }
+
   /** \brief Intra-team exclusive prefix sum with team_rank() ordering
    *   with intra-team non-deterministic ordering accumulation.
    *
@@ -249,15 +250,37 @@ class OpenMPTargetExecTeamMember {
     // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for
     // hierarchical reduction. There is an additional 10% of the requested
     // scratch memory allocated per team as padding. Hence the product with 0.1.
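The comment above describes the per-team partitioning of the global scratch buffer; the hunk that follows computes the matching offsets. A toy model of that arithmetic, with hypothetical names, and integer division standing in for the `* 0.1` padding in the real code:

    #include <cstdint>

    // Each team's slice of the HBM buffer is laid out as
    //   [ TEAM_REDUCE_SIZE | L0 scratch | L1 scratch ] plus ~10% padding;
    // when the LLVM extension supplies L0 on-chip, the L0 part drops out.
    constexpr std::int64_t team_reduce_size = 512;

    std::int64_t team_slice_begin(std::int64_t shmem_L0, std::int64_t shmem_L1,
                                  std::int64_t team, bool l0_from_llvm_extension) {
      const std::int64_t shmem =
          l0_from_llvm_extension ? shmem_L1 : shmem_L0 + shmem_L1;
      const std::int64_t total_shmem = shmem + shmem / 10;  // + padding
      return team * (total_shmem + team_reduce_size);
    }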
+  //
+  // Use llvm extensions for dynamic shared memory with compiler/architecture
+  // combinations where they are supported.
+  //
+  // The size allocated in HBM now changes based on whether we use llvm
+  // extensions.
+#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
+    const int total_shmem = shmem_size_L1 + shmem_size_L1 * 0.1;
+#else
+    const int total_shmem =
+        shmem_size_L0 + shmem_size_L1 + (shmem_size_L0 + shmem_size_L1) * 0.1;
+#endif
+
+    // Per team offset for buffer in HBM.
     const int reduce_offset =
-        m_shmem_block_index *
-        (shmem_size_L0 + shmem_size_L1 +
-         ((shmem_size_L0 + shmem_size_L1) * 0.1) + TEAM_REDUCE_SIZE);
+        m_shmem_block_index * (total_shmem + TEAM_REDUCE_SIZE);
+
+#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS)
+    const int l1_offset = reduce_offset + TEAM_REDUCE_SIZE;
+    char* l0_scratch =
+        static_cast<char*>(llvm_omp_target_dynamic_shared_alloc());
+    m_team_shared = scratch_memory_space(
+        l0_scratch, shmem_size_L0, static_cast<char*>(glb_scratch) + l1_offset,
+        shmem_size_L1);
+#else
     const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE;
     const int l1_offset = l0_offset + shmem_size_L0;
     m_team_shared = scratch_memory_space(
         (static_cast<char*>(glb_scratch) + l0_offset), shmem_size_L0,
         static_cast<char*>(glb_scratch) + l1_offset, shmem_size_L1);
+#endif
     m_reduce_scratch = static_cast<char*>(glb_scratch) + reduce_offset;
     m_league_rank = league_rank;
     m_team_rank = omp_tid;
@@ -751,6 +774,7 @@ class OpenMPTargetExec {
                               int64_t thread_local_bytes, int64_t league_size);

   static void* m_scratch_ptr;
+  static std::mutex m_mutex_scratch_ptr;
   static int64_t m_scratch_size;
   static int* m_lock_array;
   static uint64_t m_lock_size;
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
index 1abc925caed..26085f11400 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp
@@ -19,6 +19,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -140,8 +141,10 @@ class ParallelFor,
   // guarantees that the number of teams specified in the `num_teams` clause is
   // always less than or equal to the maximum concurrently running teams.
 #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU)
-#pragma omp target teams thread_limit(team_size) firstprivate(a_functor) \
-    num_teams(max_active_teams) is_device_ptr(scratch_ptr)
+  KOKKOS_IMPL_OMPTARGET_PRAGMA(
+      teams thread_limit(team_size) firstprivate(a_functor)
+          num_teams(max_active_teams) is_device_ptr(scratch_ptr)
+              KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0))
 #pragma omp parallel
   {
     if (omp_get_num_teams() > max_active_teams)
diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
index 4452af3846d..caa568a8925 100644
--- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
+++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp
@@ -55,6 +55,9 @@ class ParallelReduce,
   const pointer_type m_result_ptr;
   bool m_result_ptr_on_device;
   const int m_result_ptr_num_elems;
+  // Only let one ParallelReduce instance at a time use the scratch memory.
+  // The constructor acquires the mutex which is released in the destructor.
+ std::scoped_lock m_scratch_memory_lock; using TagType = typename Policy::work_tag; public: @@ -105,7 +108,8 @@ class ParallelReduce, m_result_ptr_on_device( MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()) {} + m_result_ptr_num_elems(arg_result_view.size()), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index a302fa71511..8abffa47a43 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -470,6 +470,10 @@ class ParallelReduce m_scratch_memory_lock; + public: void execute() const { const FunctorType& functor = m_functor_reducer.get_functor(); @@ -517,7 +521,8 @@ class ParallelReduce::value( - arg_functor_reducer.get_functor(), arg_policy.team_size())) {} + arg_functor_reducer.get_functor(), arg_policy.team_size())), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} }; } // namespace Impl diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 1d6677a1df6..c1f7851f413 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -238,8 +238,10 @@ class ParallelScanWithTotal, if (!base_t::m_result_ptr_device_accessible) { const int size = base_t::m_functor_reducer.get_reducer().value_size(); - DeepCopy( - base_t::m_result_ptr, chunk_values.data() + (n_chunks - 1), size); + DeepCopy( + base_t::m_policy.space(), base_t::m_result_ptr, + chunk_values.data() + (n_chunks - 1), size); } } else if (!base_t::m_result_ptr_device_accessible) { base_t::m_functor_reducer.get_reducer().init(base_t::m_result_ptr); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp index fb75f05f270..eb3dc3773c4 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -394,9 +395,11 @@ struct ParallelReduceSpecialize, initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(max_active_teams) thread_limit(team_size) + firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom + : result) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -482,9 +485,11 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+: result) + // Use scratch memory extensions to request dynamic shared memory for + // the right compiler/architecture combination. 
+ KOKKOS_IMPL_OMPTARGET_PRAGMA(teams num_teams(max_active_teams) thread_limit(team_size) map(to: f) \ + is_device_ptr(scratch_ptr) reduction(+: result) \ + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) #pragma omp parallel reduction(+ : result) { if (omp_get_num_teams() > max_active_teams) @@ -636,11 +641,13 @@ struct ParallelReduceSpecialize, return; } - -#pragma omp target teams num_teams(nteams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) - { + // Use scratch memory extensions to request dynamic shared memory for the + // right compiler/architecture combination. + KOKKOS_IMPL_OMPTARGET_PRAGMA( + teams num_teams(nteams) thread_limit(team_size) map(to + : f) + is_device_ptr(scratch_ptr) + KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) { #pragma omp parallel { const int team_num = omp_get_team_num(); @@ -665,9 +672,8 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : final_reducer) \ - is_device_ptr(scratch_ptr) +#pragma omp target teams distribute parallel for simd firstprivate( \ + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp index 41e62ce6e6b..6878531730d 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_MDRange.hpp @@ -438,6 +438,10 @@ class ParallelReduce m_scratch_memory_lock; + public: inline void execute() const { execute_tile( @@ -452,7 +456,8 @@ class ParallelReduce::accessible) {} + typename ViewType::memory_space>::accessible), + m_scratch_memory_lock(OpenMPTargetExec::m_mutex_scratch_ptr) {} template inline std::enable_if_t execute_tile(const FunctorType& functor, diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 672271ed6b9..9b578aca112 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -18,7 +18,6 @@ #define KOKKOS_OPENMPTARGETREDUCER_HPP #include -#include #include #include "Kokkos_OpenMPTarget_Abort.hpp" diff --git a/core/src/SYCL/Kokkos_SYCL.cpp b/core/src/SYCL/Kokkos_SYCL.cpp index 7fa935f693a..9a246f7642f 100644 --- a/core/src/SYCL/Kokkos_SYCL.cpp +++ b/core/src/SYCL/Kokkos_SYCL.cpp @@ -88,26 +88,57 @@ bool SYCL::impl_is_initialized() { void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); } void SYCL::print_configuration(std::ostream& os, bool verbose) const { - os << "Devices:\n"; - os << " KOKKOS_ENABLE_SYCL: yes\n"; - os << "\nRuntime Configuration:\n"; - os << "macro KOKKOS_ENABLE_SYCL : defined\n"; +#ifdef KOKKOS_ENABLE_ONEDPL + os << "macro KOKKOS_ENABLE_ONEDPL : defined\n"; +#else + os << "macro KOKKOS_ENABLE_ONEDPL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : defined\n"; #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif - +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; +#else + os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : undefined\n"; +#endif #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES os << "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : defined\n"; #else os 
<< "macro KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES : undefined\n"; #endif - if (verbose) + int counter = 0; + int active_device = Kokkos::device_id(); + std::cout << "\nAvailable devices: \n"; + std::vector devices = Impl::get_sycl_devices(); + for (const auto& device : devices) { + std::string device_type; + switch (device.get_info()) { + case sycl::info::device_type::cpu: device_type = "cpu"; break; + case sycl::info::device_type::gpu: device_type = "gpu"; break; + case sycl::info::device_type::accelerator: + device_type = "accelerator"; + break; + case sycl::info::device_type::custom: device_type = "custom"; break; + case sycl::info::device_type::automatic: device_type = "automatic"; break; + case sycl::info::device_type::host: device_type = "host"; break; + case sycl::info::device_type::all: device_type = "all"; break; + } + os << "[" << device.get_backend() << "]:" << device_type << ':' << counter + << "] " << device.get_info(); + if (counter == active_device) os << " : Selected"; + os << '\n'; + ++counter; + } + + if (verbose) { + os << '\n'; SYCL::impl_sycl_info(os, m_space_instance->m_queue->get_device()); + } } void SYCL::fence(const std::string& name) const { @@ -137,20 +168,11 @@ void SYCL::impl_static_fence(const std::string& name) { } void SYCL::impl_initialize(InitializationSettings const& settings) { - std::vector gpu_devices = - sycl::device::get_devices(sycl::info::device_type::gpu); - // If the device id is not specified and there are no GPUs, sidestep Kokkos - // device selection and use whatever is available (if no GPU architecture is - // specified). -#if !defined(KOKKOS_ARCH_INTEL_GPU) && !defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if (!settings.has_device_id() && gpu_devices.empty()) { - Impl::SYCLInternal::singleton().initialize(sycl::device()); - Impl::SYCLInternal::m_syclDev = 0; - return; - } -#endif - const auto id = ::Kokkos::Impl::get_gpu(settings); - Impl::SYCLInternal::singleton().initialize(gpu_devices[id]); + const auto& visible_devices = ::Kokkos::Impl::get_visible_devices(); + const auto id = + ::Kokkos::Impl::get_gpu(settings).value_or(visible_devices[0]); + std::vector sycl_devices = Impl::get_sycl_devices(); + Impl::SYCLInternal::singleton().initialize(sycl_devices[id]); Impl::SYCLInternal::m_syclDev = id; } @@ -243,9 +265,32 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, namespace Impl { +std::vector get_sycl_devices() { +#if defined(KOKKOS_ARCH_INTEL_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) || \ + defined(KOKKOS_ARCH_AMD_GPU) + std::vector devices = + sycl::device::get_devices(sycl::info::device_type::gpu); +#if defined(KOKKOS_ARCH_INTEL_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_level_zero; +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_cuda; +#elif defined(KOKKOS_ARCH_AMD_GPU) + sycl::backend backend = sycl::backend::ext_oneapi_hip; +#endif + devices.erase(std::remove_if(devices.begin(), devices.end(), + [backend](const sycl::device& d) { + return d.get_backend() != backend; + }), + devices.end()); +#else + std::vector devices = sycl::device::get_devices(); +#endif + return devices; +} + int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); -} +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/core/src/SYCL/Kokkos_SYCL.hpp b/core/src/SYCL/Kokkos_SYCL.hpp index be6b4b89302..0f3d1f0994d 100644 --- a/core/src/SYCL/Kokkos_SYCL.hpp +++ b/core/src/SYCL/Kokkos_SYCL.hpp @@ -78,19 +78,15 @@ class SYCL { //! 
\name Functions that all Kokkos devices must implement. //@{ - KOKKOS_INLINE_FUNCTION static int in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { #if defined(__SYCL_DEVICE_ONLY__) return true; #else return false; #endif } - - /** \brief Set the device in a "sleep" state. */ - static bool sleep(); - - /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */ - static bool wake(); +#endif /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); @@ -188,6 +184,10 @@ std::vector partition_space(const SYCL& sycl_space, sycl::queue(context, device, sycl::property::queue::in_order())); return instances; } + +namespace Impl { +std::vector get_sycl_devices(); +} // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 080369770d7..0e67adb5787 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -54,7 +54,7 @@ Kokkos::View sycl_global_unique_token_locks( } SYCLInternal::~SYCLInternal() { - if (!was_finalized || m_scratchSpace || m_scratchFlags) { + if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " "Kokkos::Experimental::SYCL::finalize()" << std::endl; @@ -102,6 +102,23 @@ void SYCLInternal::initialize(const sycl::device& d) { void SYCLInternal::initialize(const sycl::queue& q) { KOKKOS_EXPECTS(!is_initialized()); +#define KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(BACKEND, REQUIRED) \ + if (BACKEND != REQUIRED) \ + Kokkos::abort( \ + "The SYCL execution space instance was initialized with an " \ + "unsupported backend type! 
For this GPU architecture, only " #REQUIRED \ + " is supported.") +#if defined(KOKKOS_ARCH_INTEL_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_level_zero); +#elif defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_cuda); +#elif defined(KOKKOS_ARCH_AMD_GPU) + KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(q.get_backend(), + sycl::backend::ext_oneapi_hip); +#endif + if (was_finalized) Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n"); @@ -196,14 +213,22 @@ void SYCLInternal::finalize() { #endif } - using RecordSYCL = Kokkos::Impl::SharedAllocationRecord; + auto device_mem_space = SYCLDeviceUSMSpace(*m_queue); + auto host_mem_space = SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchSpace)); + device_mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + if (nullptr != m_scratchHost) + host_mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); if (nullptr != m_scratchFlags) - RecordSYCL::decrement(RecordSYCL::get_record(m_scratchFlags)); + device_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); m_syclDev = -1; m_scratchSpaceCount = 0; m_scratchSpace = nullptr; + m_scratchHostCount = 0; + m_scratchHost = nullptr; m_scratchFlagsCount = 0; m_scratchFlags = nullptr; @@ -228,54 +253,68 @@ void SYCLInternal::finalize() { sycl::device_ptr SYCLInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - m_scratchSpaceCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) - Record::decrement(Record::get_record(m_scratchSpace)); + mem_space.deallocate(m_scratchSpace, + m_scratchSpaceCount * sizeScratchGrain); + + m_scratchSpaceCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size); + m_scratchSpace = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size)); + } + + return m_scratchSpace; +} + +sycl::host_ptr SYCLInternal::scratch_host(const std::size_t size) { + if (verify_is_initialized("scratch_unified") && + m_scratchHostCount < scratch_count(size)) { + auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); - Record::increment(r); + if (nullptr != m_scratchHost) + mem_space.deallocate(m_scratchHost, + m_scratchHostCount * sizeScratchGrain); - m_scratchSpace = reinterpret_cast(r->data()); + m_scratchHostCount = scratch_count(size); + + std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( + m_scratchHostCount, sizeScratchGrain); + m_scratchHost = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); } - return m_scratchSpace; + return m_scratchHost; } sycl::device_ptr SYCLInternal::scratch_flags(const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - m_scratchFlagsCount = scratch_count(size); - - using Record = Kokkos::Impl::SharedAllocationRecord< - 
Kokkos::Experimental::SYCLDeviceUSMSpace, void>; + auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) - Record::decrement(Record::get_record(m_scratchFlags)); + mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + + m_scratchFlagsCount = scratch_count(size); std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - Record* const r = Record::allocate( - Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue), - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size); - - Record::increment(r); - - m_scratchFlags = reinterpret_cast(r->data()); - } - auto memset_event = m_queue->memset(m_scratchFlags, 0, - m_scratchFlagsCount * sizeScratchGrain); + m_scratchFlags = static_cast(mem_space.allocate( + "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); + + // We only zero-initialize the allocation when we actually allocate. + // It's the responsibility of the features using scratch_flags, + // namely parallel_reduce and parallel_scan, to reset the used values to 0. + auto memset_event = m_queue->memset(m_scratchFlags, 0, + m_scratchFlagsCount * sizeScratchGrain); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); + m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); #endif + } return m_scratchFlags; } @@ -318,15 +357,12 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { assert(m_q); if (m_capacity < n) { - using Record = Kokkos::Impl::SharedAllocationRecord; - // First free what we have (in case malloc can reuse it) - if (m_data) Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + if (m_data) alloc_space.deallocate(m_data, m_capacity); - Record* const r = Record::allocate( - AllocationSpace(*m_q), "Kokkos::Experimental::SYCL::USMObjectMem", n); - Record::increment(r); + m_data = + alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); - m_data = r->data(); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); m_capacity = n; @@ -340,8 +376,8 @@ void SYCLInternal::USMObjectMem::reset() { if (m_data) { // This implies a fence since this class is not copyable // and deallocating implies a fence across all registered queues. 
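The scratch_space/scratch_host/scratch_flags rewrites above all follow the same grow-only idiom: keep the current buffer until a larger request arrives, then deallocate and reallocate directly through the memory space instead of going through SharedAllocationRecord. A minimal sketch under hypothetical names, where MemorySpace stands in for SYCLDeviceUSMSpace or SYCLHostUSMSpace:

    #include <cstddef>

    template <class MemorySpace>
    void* grow_scratch(const MemorySpace& space, void*& ptr,
                       std::size_t& capacity, std::size_t requested,
                       const char* label) {
      if (requested > capacity) {
        // Free the old buffer first so the allocator can reuse the memory.
        if (ptr) space.deallocate(ptr, capacity);
        ptr      = space.allocate(label, requested);
        capacity = requested;
      }
      return ptr;
    }

Note that with this change scratch_flags is zero-initialized only when a new buffer is actually allocated; as the added comment says, parallel_reduce and parallel_scan become responsible for resetting the used values to 0, which is why the kernels later in this diff gain `if (local_id == 0) *scratch_flags = 0;`.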
- using Record = Kokkos::Impl::SharedAllocationRecord; - Record::decrement(Record::get_record(m_data)); + AllocationSpace alloc_space(*m_q); + alloc_space.deallocate(m_data, m_capacity); m_capacity = 0; m_data = nullptr; diff --git a/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 51a617054d6..ab7e8ce71e0 100644 --- a/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -45,6 +45,7 @@ class SYCLInternal { sycl::device_ptr scratch_space(const std::size_t size); sycl::device_ptr scratch_flags(const std::size_t size); + sycl::host_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); sycl::device_ptr resize_team_scratch_space(int scratch_pool_id, std::int64_t bytes, @@ -60,6 +61,8 @@ class SYCLInternal { std::size_t m_scratchSpaceCount = 0; sycl::device_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + sycl::host_ptr m_scratchHost = nullptr; std::size_t m_scratchFlagsCount = 0; sycl::device_ptr m_scratchFlags = nullptr; // mutex to access shared memory @@ -330,8 +333,8 @@ struct sycl::is_device_copyable< Kokkos::Experimental::Impl::SYCLFunctionWrapper> : std::true_type {}; -// FIXME_SYCL Remove when this specialization when specializations for -// sycl::device_copyable also apply to const-qualified types. +#if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ + (defined(__LIBSYCL_MAJOR_VERSION) && __LIBSYCL_MAJOR_VERSION < 7) template struct NonTriviallyCopyableAndDeviceCopyable { NonTriviallyCopyableAndDeviceCopyable( @@ -356,3 +359,4 @@ struct sycl::is_device_copyable< : std::true_type {}; #endif #endif +#endif diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index f4fada570b0..7fbf5420f83 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -118,6 +118,8 @@ class Kokkos::Impl::ParallelFor, const BarePolicy bare_policy(m_policy); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { const auto range = compute_ranges(); const sycl::range<3> global_range = range.get_global_range(); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 9c5767d209f..b4de7eb89ff 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -81,6 +81,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(memcpy_event); diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index 4fc5818ce9b..ecb4a863da2 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -46,9 +46,9 @@ class Kokkos::Impl::ParallelFor, int m_shmem_size; sycl::device_ptr m_global_scratch_ptr; size_t m_scratch_size[2]; - // Only let one ParallelFor/Reduce modify the team scratch memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_scratch_lock; + // Only let one ParallelFor instance at a time use the team scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -59,6 +59,8 @@ class Kokkos::Impl::ParallelFor, const Kokkos::Experimental::SYCL& space = policy.space(); sycl::queue& q = space.sycl_queue(); + desul::ensure_sycl_lock_arrays_on_device(q); + auto parallel_for_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least for // host queues @@ -74,7 +76,8 @@ class Kokkos::Impl::ParallelFor, auto lambda = [=](sycl::nd_item<2> item) { const member_type team_member( - team_scratch_memory_L0.get_pointer(), shmem_begin, scratch_size[0], + KOKKOS_IMPL_SYCL_GET_MULTI_PTR(team_scratch_memory_L0), shmem_begin, + scratch_size[0], global_scratch_ptr + item.get_group(1) * scratch_size[1], scratch_size[1], item, item.get_group_linear_id(), item.get_group_range(1)); @@ -141,9 +144,9 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()), - m_scratch_lock(arg_policy.space() - .impl_internal_space_instance() - ->m_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { // FIXME_SYCL optimize if (m_team_size < 0) m_team_size = diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 6964c2dbcf0..f55280e22e3 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -78,7 +78,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( m_space.impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -95,9 +95,16 @@ class Kokkos::Impl::ParallelReduce results_ptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If n_tiles==0 we only call init() and final() working with the global // scratch memory but don't copy back to m_result_ptr yet. if (n_tiles == 0) { @@ -109,8 +116,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); cgh.single_task([=]() { const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -148,8 +157,10 @@ class Kokkos::Impl::ParallelReduce>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); @@ -223,6 +234,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_mem[local_id * value_count]); else { @@ -268,6 +280,7 @@ class Kokkos::Impl::ParallelReduce= static_cast(n_wgroups)) reducer.init(&local_value); else { @@ -296,11 +309,13 @@ class Kokkos::Impl::ParallelReduce( - m_space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + m_space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(value_type) * value_count); } return last_reduction_event; @@ -335,9 +350,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. + std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_MDRANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 8c900cfa428..5333e3c8a83 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -51,7 +51,7 @@ class Kokkos::Impl::ParallelReduce::accessible), - m_shared_memory_lock( + m_scratch_buffers_lock( p.space().impl_internal_space_instance()->m_mutexScratchSpace) {} private: @@ -70,11 +70,20 @@ class Kokkos::Impl::ParallelReduce results_ptr = nullptr; - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto host_result_ptr = + (m_result_ptr && !m_result_ptr_device_accessible) + ? static_cast>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. @@ -168,6 +177,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -210,6 +220,7 @@ class Kokkos::Impl::ParallelReduce= n_wgroups) reducer.init(&local_value); else { @@ -320,11 +331,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result " + "not device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -354,9 +367,9 @@ class Kokkos::Impl::ParallelReduce m_shared_memory_lock; + // Only let one ParallelReduce instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_buffers_lock; }; #endif /* KOKKOS_SYCL_PARALLEL_REDUCE_RANGE_HPP */ diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index 07145b0fb93..27165c59e3a 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -59,9 +59,10 @@ class Kokkos::Impl::ParallelReduce m_scratch_lock; + // Only let one ParallelReduce instance at a time use the team scratch memory + // and the host scratch memory. The constructor acquires the mutex which is + // released in the destructor. + std::scoped_lock m_scratch_buffers_lock; int m_scratch_pool_id = -1; template @@ -79,9 +80,16 @@ class Kokkos::Impl::ParallelReduce>( + instance.scratch_host(sizeof(value_type) * value_count)) + : nullptr; sycl::event last_reduction_event; + desul::ensure_sycl_lock_arrays_on_device(q); + // If size<=1 we only call init(), the functor and possibly final once // working with the global scratch memory but don't copy back to // m_result_ptr yet. @@ -89,8 +97,10 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u))); - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? static_cast>(m_result_ptr) + : static_cast>(host_result_ptr); auto parallel_reduce_event = q.submit([&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -121,9 +131,10 @@ class Kokkos::Impl::ParallelReduce) functor(team_member, update); else @@ -160,12 +171,16 @@ class Kokkos::Impl::ParallelReduce const global_scratch_ptr = m_global_scratch_ptr; + sycl::local_accessor num_teams_done(1, cgh); auto team_reduction_factory = [&](sycl::local_accessor local_mem, sycl::device_ptr results_ptr) { - sycl::global_ptr device_accessible_result_ptr = - m_result_ptr_device_accessible ? m_result_ptr : nullptr; + auto device_accessible_result_ptr = + m_result_ptr_device_accessible + ? 
static_cast>(m_result_ptr) + : static_cast>( + host_result_ptr); auto lambda = [=](sycl::nd_item<2> item) { auto n_wgroups = item.get_group_range()[1]; int wgroup_size = @@ -173,8 +188,6 @@ class Kokkos::Impl::ParallelReduce( - local_mem[wgroup_size * std::max(value_count, 1u)]); const auto local_id = item.get_local_linear_id(); const CombinedFunctorReducerType& functor_reducer = functor_reducer_wrapper.get_functor(); @@ -188,8 +201,8 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } sycl::group_barrier(item.get_group()); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_mem[local_id * value_count]); else { @@ -241,8 +255,8 @@ class Kokkos::Impl::ParallelReduce scratch_flags_ref(*scratch_flags); - num_teams_done = ++scratch_flags_ref; + num_teams_done[0] = ++scratch_flags_ref; } item.barrier(sycl::access::fence_space::local_space); - if (num_teams_done == n_wgroups) { + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; if (local_id >= n_wgroups) reducer.init(&local_value); else { @@ -311,10 +326,7 @@ class Kokkos::Impl::ParallelReduce local_mem( - sycl::range<1>(wgroup_size) * std::max(value_count, 1u) + - (sizeof(unsigned int) + sizeof(value_type) - 1) / - sizeof(value_type), - cgh); + sycl::range<1>(wgroup_size) * std::max(value_count, 1u), cgh); const auto init_size = std::max((size + wgroup_size - 1) / wgroup_size, 1); @@ -358,11 +370,13 @@ class Kokkos::Impl::ParallelReduce( - space, m_result_ptr, results_ptr, - sizeof(*m_result_ptr) * value_count); + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x slower. + if (host_result_ptr) { + space.fence( + "Kokkos::Impl::ParallelReduce::execute: result not " + "device-accessible"); + std::memcpy(m_result_ptr, host_result_ptr, + sizeof(*m_result_ptr) * value_count); } return last_reduction_event; @@ -448,9 +462,9 @@ class Kokkos::Impl::ParallelReducem_team_scratch_mutex) { + m_scratch_buffers_lock(arg_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { initialize(); } }; diff --git a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 04425723e19..977b69bc9eb 100644 --- a/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP -#define KOKKO_SYCL_PARALLEL_SCAN_RANGE_HPP +#ifndef KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP +#define KOKKOS_SYCL_PARALLEL_SCAN_RANGE_HPP #include #include @@ -111,13 +111,13 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - pointer_type m_scratch_space = nullptr; - const pointer_type m_result_ptr; + sycl::host_ptr m_scratch_host = nullptr; + pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; - // Only let one Parallel/Scan modify the shared memory. The - // constructor acquires the mutex which is released in the destructor. - std::scoped_lock m_shared_memory_lock; + // Only let one ParallelScan instance at a time use the host scratch memory. + // The constructor acquires the mutex which is released in the destructor. 
+ std::scoped_lock m_scratch_buffers_lock; private: template @@ -187,6 +187,7 @@ class ParallelScanSYCLBase { } item.barrier(sycl::access::fence_space::global_space); if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; value_type total; reducer.init(&total); @@ -220,6 +221,8 @@ class ParallelScanSYCLBase { sycl::device_ptr global_mem; sycl::device_ptr group_results; + desul::ensure_sycl_lock_arrays_on_device(q); + auto perform_work_group_scans = q.submit([&](sycl::handler& cgh) { sycl::local_accessor num_teams_done(1, cgh); @@ -253,7 +256,8 @@ class ParallelScanSYCLBase { global_mem = static_cast>(instance.scratch_space( n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_space = global_mem; + m_scratch_host = static_cast>( + instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; @@ -281,10 +285,11 @@ class ParallelScanSYCLBase { // Write results to global memory auto update_global_results = q.submit([&](sycl::handler& cgh) { - auto result_ptr_device_accessible = m_result_ptr_device_accessible; // The compiler failed with CL_INVALID_ARG_VALUE if using m_result_ptr // directly. - auto result_ptr = m_result_ptr_device_accessible ? m_result_ptr : nullptr; + pointer_type result_ptr = m_result_ptr_device_accessible + ? m_result_ptr + : static_cast(m_scratch_host); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES cgh.depends_on(perform_work_group_scans); @@ -293,7 +298,6 @@ class ParallelScanSYCLBase { cgh.parallel_for( sycl::nd_range<1>(n_wgroups * wgroup_size, wgroup_size), [=](sycl::nd_item<1> item) { - auto global_mem_copy = global_mem; const index_type global_id = item.get_global_linear_id(); const CombinedFunctorReducer< FunctorType, typename Analysis::Reducer>& functor_reducer = @@ -312,9 +316,7 @@ class ParallelScanSYCLBase { else functor(WorkTag(), global_id + begin, update, true); - global_mem_copy[global_id] = update; - if (global_id == size - 1 && result_ptr_device_accessible) - *result_ptr = update; + if (global_id == size - 1) *result_ptr = update; } }); }); @@ -351,9 +353,9 @@ class ParallelScanSYCLBase { m_policy(arg_policy), m_result_ptr(arg_result_ptr), m_result_ptr_device_accessible(arg_result_ptr_device_accessible), - m_shared_memory_lock(m_policy.space() - .impl_internal_space_instance() - ->m_mutexScratchSpace) {} + m_scratch_buffers_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexScratchSpace) {} }; } // namespace Kokkos::Impl @@ -390,11 +392,13 @@ class Kokkos::Impl::ParallelScanWithTotal< Base::impl_execute([&]() { const long long nwork = Base::m_policy.end() - Base::m_policy.begin(); if (nwork > 0 && !Base::m_result_ptr_device_accessible) { + // Using DeepCopy instead of fence+memcpy turned out to be up to 2x + // slower. 
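The comment just above, repeated across these reduction and scan hunks, motivates the new result path: instead of DeepCopy out of device scratch, the kernel writes its final value into USM host memory, and the host then fences and does a plain memcpy. A hedged sketch of that pattern, with hypothetical names:

    #include <cstddef>
    #include <cstring>

    template <class ExecSpace, class ValueType>
    void copy_result_to_host(const ExecSpace& exec, ValueType* result,
                             const ValueType* host_staging, std::size_t count) {
      // The fence orders the kernel's write to the USM host allocation ...
      exec.fence("example: result not device-accessible");
      // ... after which a plain memcpy suffices (and, per the comment above,
      // is up to 2x faster than DeepCopy for these small copies).
      std::memcpy(result, host_staging, sizeof(ValueType) * count);
    }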
+ m_exec.fence( + "Kokkos::Impl::ParallelReduce::execute: " + "result not device-accessible"); const int size = Base::m_functor_reducer.get_reducer().value_size(); - DeepCopy(m_exec, Base::m_result_ptr, - Base::m_scratch_space + nwork - 1, - size); + std::memcpy(Base::m_result_ptr, Base::m_scratch_host, size); } }); } diff --git a/core/src/SYCL/Kokkos_SYCL_Space.cpp b/core/src/SYCL/Kokkos_SYCL_Space.cpp index 64b7f56796a..9cc8008cdf3 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -25,7 +25,6 @@ #include #include #include -#include #include /*--------------------------------------------------------------------------*/ @@ -243,226 +242,17 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, } // namespace Experimental } // namespace Kokkos -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLDeviceUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLSharedUSMSpace, void>::s_root_record; - -SharedAllocationRecord SharedAllocationRecord< - Kokkos::Experimental::SYCLHostUSMSpace, void>::s_root_record; -#endif - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(space, label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Experimental::SYCL exec; - Kokkos::Impl::DeepCopy( - exec, RecordBase::m_alloc_ptr, &header, sizeof(SharedAllocationHeader)); - exec.fence( - "SharedAllocationRecord::SharedAllocationRecord(): fence after copying header from " - "HostSpace"); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& arg_exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& space, - const std::string& label, const size_t size, - const SharedAllocationRecord::function_type dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Kokkos::Impl::checked_allocation_with_header(arg_exec_space, space, - label, size), - sizeof(SharedAllocationHeader) + size, dealloc, label), - m_space(space) { - SharedAllocationHeader header; - - this->base_t::_fill_host_accessible_header_info(header, label); - - // Copy to device memory - Kokkos::Impl::DeepCopy( - arg_exec_space, RecordBase::m_alloc_ptr, &header, - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - 
Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(exec_space, arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - - this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - const auto alloc_size = SharedAllocationRecord::m_alloc_size; - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - alloc_size, alloc_size - sizeof(SharedAllocationHeader)); -} - 
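For context on the records being deleted here: each one managed a single block laid out as [ SharedAllocationHeader | user data ], which is why the destructors above pass `alloc_size - sizeof(SharedAllocationHeader)` as the logical size. A host-side toy of that layout (illustrative struct and names, not the Kokkos definitions):

    #include <cstddef>
    #include <cstdlib>
    #include <new>

    struct ExampleHeader {
      char label[64];  // stand-in for the real header fields
    };

    void* example_allocate_with_header(std::size_t user_size) {
      // One block holds the header followed by the user's payload; the
      // pointer handed to the user starts just past the header.
      char* base = static_cast<char*>(
          std::malloc(sizeof(ExampleHeader) + user_size));
      new (base) ExampleHeader{};
      return base + sizeof(ExampleHeader);
    }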
-//---------------------------------------------------------------------------- - -} // namespace Impl -} // namespace Kokkos - //============================================================================== // {{{1 #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; -template class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - -} // namespace Impl -} // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( + Kokkos::Experimental::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/core/src/SYCL/Kokkos_SYCL_Space.hpp b/core/src/SYCL/Kokkos_SYCL_Space.hpp index 239c6e3ce0b..b86cfca413c 100644 --- a/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -66,11 +66,6 @@ class SYCLDeviceUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLDeviceUSM"; }; private: @@ -87,6 +82,16 @@ class SYCLSharedUSMSpace { SYCLSharedUSMSpace(); explicit SYCLSharedUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -102,11 +107,6 @@ class SYCLSharedUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { return "SYCLSharedUSM"; }; private: @@ -123,6 +123,16 @@ class SYCLHostUSMSpace { SYCLHostUSMSpace(); explicit SYCLHostUSMSpace(sycl::queue queue); + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } void* allocate(const SYCL& exec_space, const std::size_t arg_alloc_size) const; void* allocate(const SYCL& exec_space, const char* arg_label, @@ -138,11 +148,6 @@ class SYCLHostUSMSpace { const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; - private: - template - friend class LogicalMemorySpace; - - public: static constexpr const char* name() { 
return "SYCLHostUSM"; }; private: @@ -166,19 +171,16 @@ struct is_sycl_type_space : public std::true_type {}; static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); template <> struct MemorySpaceAccess -class SharedAllocationRecord - : public HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - friend class HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using base_t = HostInaccessibleSharedAllocationRecordCommon< - Kokkos::Experimental::SYCLDeviceUSMSpace>; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - -#ifdef KOKKOS_ENABLE_DEBUG - static RecordBase s_root_record; -#endif - - const Kokkos::Experimental::SYCLDeviceUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLDeviceUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLSharedUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLSharedUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const 
ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLSharedUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -template <> -class SharedAllocationRecord - : public SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace> { - private: - friend class SharedAllocationRecordCommon< - Kokkos::Experimental::SYCLHostUSMSpace>; - using base_t = - SharedAllocationRecordCommon; - using RecordBase = SharedAllocationRecord; - - SharedAllocationRecord(const SharedAllocationRecord&) = delete; - SharedAllocationRecord(SharedAllocationRecord&&) = delete; - SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; - - static RecordBase s_root_record; - - const Kokkos::Experimental::SYCLHostUSMSpace m_space; - - protected: - ~SharedAllocationRecord(); - - SharedAllocationRecord() = default; - - template - SharedAllocationRecord( - const ExecutionSpace& /*exec_space*/, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate) - : SharedAllocationRecord(arg_space, arg_label, arg_alloc_size, - arg_dealloc) {} - - SharedAllocationRecord( - const Kokkos::Experimental::SYCL& exec_space, - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); - - SharedAllocationRecord( - const Kokkos::Experimental::SYCLHostUSMSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &base_t::deallocate); -}; - -} // namespace Impl - } // namespace Kokkos +KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( + Kokkos::Experimental::SYCLHostUSMSpace); + #endif #endif diff --git a/core/src/SYCL/Kokkos_SYCL_Team.hpp b/core/src/SYCL/Kokkos_SYCL_Team.hpp index 89c09c3195f..dbba3827581 100644 --- a/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -140,9 +140,14 @@ class SYCLTeamMember { } value = sg.shuffle(value, 0); + const auto n_subgroups = sg.get_group_range()[0]; + if (n_subgroups == 1) { + reducer.reference() = value; + return; + } + // We need to chunk up the whole reduction because we might not have // allocated enough memory. 
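//----------------------------------------------------------------------------
// The early return added above is the interesting part: after the subgroup
// shuffle, `value` already holds the subgroup's full result, so a team that
// spans a single subgroup never needs the chunked cross-subgroup pass. A
// stand-alone sketch under the same names (reducer.reference() is assumed to
// expose the destination, as Kokkos reducers do):

template <class Reducer, class ValueType>
bool finish_if_single_subgroup(Reducer& reducer, ValueType value,
                               unsigned n_subgroups) {
  if (n_subgroups == 1) {
    reducer.reference() = value;  // shuffle already produced the team result
    return true;                  // skip the chunked reduction entirely
  }
  return false;  // more than one subgroup: fall through to chunking below
}
//----------------------------------------------------------------------------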
- const auto n_subgroups = sg.get_group_range()[0]; const unsigned int maximum_work_range = std::min(m_team_reduce_size / sizeof(value_type), n_subgroups); diff --git a/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 9548f211d9e..61db6b34aac 100644 --- a/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst, - typename View::const_value_type&) { + const View& dst) { auto event = exec_space.impl_internal_space_instance()->m_queue->memset( dst.data(), 0, dst.size() * sizeof(typename View::value_type)); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -35,12 +34,6 @@ struct ZeroMemset> { ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); #endif } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); - } }; } // namespace Impl diff --git a/core/src/Serial/Kokkos_Serial.cpp b/core/src/Serial/Kokkos_Serial.cpp index 071ecdbc4fa..39b201976b5 100644 --- a/core/src/Serial/Kokkos_Serial.cpp +++ b/core/src/Serial/Kokkos_Serial.cpp @@ -153,7 +153,7 @@ void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; os << " KOKKOS_ENABLE_SERIAL: yes\n"; -#ifdef KOKKOS_INTERNAL_NOT_PARALLEL +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS os << "Kokkos atomics disabled\n"; #endif diff --git a/core/src/Serial/Kokkos_Serial.hpp b/core/src/Serial/Kokkos_Serial.hpp index 67119cac164..43eb4992ed7 100644 --- a/core/src/Serial/Kokkos_Serial.hpp +++ b/core/src/Serial/Kokkos_Serial.hpp @@ -121,7 +121,10 @@ class Serial { /// For the Serial device, this method always returns false, /// because parallel_for or parallel_reduce with the Serial device /// always execute sequentially. - inline static int in_parallel() { return false; } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED inline static int in_parallel() { return false; } +#endif /// \brief Wait until all dispatched functors complete. 
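//----------------------------------------------------------------------------
// The guard-plus-attribute pattern used for in_parallel() above recurs for
// Threads later in this patch: the member only exists when deprecated APIs
// are compiled in, and is then marked so every caller gets a warning. A
// condensed sketch, assuming KOKKOS_DEPRECATED expands to [[deprecated]]:

struct ExampleExecutionSpace {
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
  // Serial work never nests inside a parallel region, hence the constant.
  [[deprecated]] static int in_parallel() { return 0; }
#endif
};
//----------------------------------------------------------------------------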
/// diff --git a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 69787aa5001..67978aa3e9f 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_MDRANGE_HPP -#define KOKKO_SERIAL_PARALLEL_MDRANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_MDRANGE_HPP #include #include diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 56894716dbd..91b4c567113 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_RANGE_HPP -#define KOKKO_SERIAL_PARALLEL_RANGE_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_RANGE_HPP +#define KOKKOS_SERIAL_PARALLEL_RANGE_HPP #include diff --git a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index 0876f1af229..f34a7daaca0 100644 --- a/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKO_SERIAL_PARALLEL_TEAM_HPP -#define KOKKO_SERIAL_PARALLEL_TEAM_HPP +#ifndef KOKKOS_SERIAL_PARALLEL_TEAM_HPP +#define KOKKOS_SERIAL_PARALLEL_TEAM_HPP #include diff --git a/core/src/Serial/Kokkos_Serial_Task.hpp b/core/src/Serial/Kokkos_Serial_Task.hpp index f9c86f55ce0..5905d6d32e1 100644 --- a/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/core/src/Serial/Kokkos_Serial_Task.hpp @@ -121,7 +121,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; auto& data = serial_execution_space.impl_internal_space_instance() @@ -157,7 +157,7 @@ class TaskQueueSpecializationConstrained< using task_base_type = TaskBase; using queue_type = typename scheduler_type::queue_type; - task_base_type* const end = (task_base_type*)task_base_type::EndTag; + auto* const end = reinterpret_cast(task_base_type::EndTag); execution_space serial_execution_space; diff --git a/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 3ec2dfbcfa0..6ad6aabc5a7 100644 --- a/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,14 +35,11 @@ template struct ZeroMemset< std::conditional_t::value, Serial, DummyExecutionSpace>, - View> - : public ZeroMemset> { - using Base = ZeroMemset>; - using Base::Base; - - ZeroMemset(const Serial&, const View& dst, - typename View::const_value_type& value) - : Base(dst, value) {} + View> { + ZeroMemset(const Serial&, const View& dst) { + using ValueType = typename View::value_type; + std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); + } }; } // namespace Impl diff --git a/core/src/Threads/Kokkos_Threads.hpp b/core/src/Threads/Kokkos_Threads.hpp index c0d70c03ecb..31653c46cac 100644 --- a/core/src/Threads/Kokkos_Threads.hpp +++ b/core/src/Threads/Kokkos_Threads.hpp @@ -38,15 +38,6 @@ static_assert(false, /*--------------------------------------------------------------------------*/ -namespace Kokkos { -namespace Impl { -class 
ThreadsExec; -enum class fence_is_static { yes, no }; -} // namespace Impl -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ - namespace Kokkos { /** \brief Execution space for a pool of C++11 threads on a CPU. */ @@ -73,7 +64,9 @@ class Threads { /// \brief True if and only if this method is being called in a /// thread-parallel function. - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; diff --git a/core/src/Threads/Kokkos_ThreadsExec.cpp b/core/src/Threads/Kokkos_Threads_Instance.cpp similarity index 56% rename from core/src/Threads/Kokkos_ThreadsExec.cpp rename to core/src/Threads/Kokkos_Threads_Instance.cpp index 801a1ac82e9..3842966cd77 100644 --- a/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -16,17 +16,15 @@ #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #define KOKKOS_IMPL_PUBLIC_INCLUDE +#include "Threads/Kokkos_Threads_Instance.hpp" #endif #include -#include -#include #include #include #include #include -#include #include @@ -41,7 +39,6 @@ namespace Kokkos { namespace Impl { namespace { -std::mutex host_internal_cppthread_mutex; // std::thread compatible driver. // Recovery from an exception would require constant intra-thread health @@ -49,7 +46,7 @@ std::mutex host_internal_cppthread_mutex; // abort the process. void internal_cppthread_driver() { try { - ThreadsExec::driver(); + ThreadsInternal::driver(); } catch (const std::exception &x) { std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl; @@ -62,32 +59,17 @@ void internal_cppthread_driver() { } } -ThreadsExec s_threads_process; -ThreadsExec *s_threads_exec[ThreadsExec::MAX_THREAD_COUNT] = {nullptr}; -std::thread::id s_threads_pid[ThreadsExec::MAX_THREAD_COUNT]; -std::pair s_threads_coord[ThreadsExec::MAX_THREAD_COUNT]; +ThreadsInternal s_threads_process; +ThreadsInternal *s_threads_exec[ThreadsInternal::MAX_THREAD_COUNT] = {nullptr}; +std::thread::id s_threads_pid[ThreadsInternal::MAX_THREAD_COUNT]; +std::pair + s_threads_coord[ThreadsInternal::MAX_THREAD_COUNT]; int s_thread_pool_size[3] = {0, 0, 0}; -unsigned s_current_reduce_size = 0; -unsigned s_current_shared_size = 0; - -void (*volatile s_current_function)(ThreadsExec &, const void *); +void (*volatile s_current_function)(ThreadsInternal &, const void *); const void *volatile s_current_function_arg = nullptr; -struct Sentinel { - ~Sentinel() { - if (s_thread_pool_size[0] || s_thread_pool_size[1] || - s_thread_pool_size[2] || s_current_reduce_size || - s_current_shared_size || s_current_function || s_current_function_arg || - s_threads_exec[0]) { - std::cerr << "ERROR : Process exiting while Kokkos::Threads is still " - "initialized" - << std::endl; - } - } -}; - inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); unsigned count = 0; @@ -97,6 +79,12 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } +void wait_yield(volatile ThreadState &flag, const ThreadState value) { + while (value == flag) { + std::this_thread::yield(); + } +} + } // namespace } // namespace Impl } // namespace Kokkos @@ -107,66 +95,44 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { namespace Kokkos { namespace Impl { 
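//----------------------------------------------------------------------------
// As the driver comment above spells out, a worker that throws cannot be
// recovered; any escaped exception is deliberately turned into a process
// abort. A condensed model of that policy (driver_body stands in for
// ThreadsInternal::driver):

#include <cstdlib>
#include <exception>
#include <iostream>

void worker_main(void (*driver_body)()) {
  try {
    driver_body();
  } catch (const std::exception &x) {
    std::cerr << "Exception thrown from worker thread: " << x.what()
              << std::endl;
    std::abort();  // no realistic recovery once a worker dies
  } catch (...) {
    std::cerr << "Exception thrown from worker thread" << std::endl;
    std::abort();
  }
}
//----------------------------------------------------------------------------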
-//---------------------------------------------------------------------------- -// Spawn a thread - -void ThreadsExec::spawn() { - std::thread t(internal_cppthread_driver); - t.detach(); -} - -//---------------------------------------------------------------------------- - -bool ThreadsExec::is_process() { +bool ThreadsInternal::is_process() { static const std::thread::id master_pid = std::this_thread::get_id(); return master_pid == std::this_thread::get_id(); } -void ThreadsExec::global_lock() { host_internal_cppthread_mutex.lock(); } - -void ThreadsExec::global_unlock() { host_internal_cppthread_mutex.unlock(); } - //---------------------------------------------------------------------------- -void ThreadsExec::wait_yield(volatile int &flag, const int value) { - while (value == flag) { - std::this_thread::yield(); - } -} - -void execute_function_noop(ThreadsExec &, const void *) {} +void execute_function_noop(ThreadsInternal &, const void *) {} -void ThreadsExec::driver() { +void ThreadsInternal::driver() { SharedAllocationRecord::tracking_enable(); - ThreadsExec this_thread; + ThreadsInternal this_thread; - while (ThreadsExec::Active == this_thread.m_pool_state) { + while (this_thread.m_pool_state == ThreadState::Active) { (*s_current_function)(this_thread, s_current_function_arg); // Deactivate thread and wait for reactivation - this_thread.m_pool_state = ThreadsExec::Inactive; + this_thread.m_pool_state = ThreadState::Inactive; - wait_yield(this_thread.m_pool_state, ThreadsExec::Inactive); + wait_yield(this_thread.m_pool_state, ThreadState::Inactive); } } -ThreadsExec::ThreadsExec() +ThreadsInternal::ThreadsInternal() : m_pool_base(nullptr), m_scratch(nullptr), m_scratch_reduce_end(0), m_scratch_thread_end(0), - m_numa_rank(0), - m_numa_core_rank(0), m_pool_rank(0), m_pool_size(0), m_pool_fan_size(0), - m_pool_state(ThreadsExec::Terminating) { + m_pool_state(ThreadState::Terminating) { if (&s_threads_process != this) { - // A spawned thread - - ThreadsExec *const nil = nullptr; + // The code in the if is executed by a spawned thread not by the root + // thread + ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding const int entry = reinterpret_cast(s_current_function_arg) < @@ -178,80 +144,66 @@ ThreadsExec::ThreadsExec() // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && nil == atomic_compare_exchange(s_threads_exec + entry, nil, this)) { - const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - m_numa_rank = coord.first; - m_numa_core_rank = coord.second; - m_pool_base = s_threads_exec; - m_pool_rank = s_thread_pool_size[0] - (entry + 1); - m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); - m_pool_size = s_thread_pool_size[0]; - m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); - m_pool_state = ThreadsExec::Active; + m_pool_base = s_threads_exec; + m_pool_rank = s_thread_pool_size[0] - (entry + 1); + m_pool_rank_rev = s_thread_pool_size[0] - (pool_rank() + 1); + m_pool_size = s_thread_pool_size[0]; + m_pool_fan_size = fan_size(m_pool_rank, m_pool_size); + m_pool_state = ThreadState::Active; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); // Inform spawning process that the threads_exec entry has been set. - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; } else { // Inform spawning process that the threads_exec entry could not be set. 
- s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } else { // Enables 'parallel_for' to execute on uninitialized Threads device m_pool_rank = 0; m_pool_size = 1; - m_pool_state = ThreadsExec::Inactive; + m_pool_state = ThreadState::Inactive; s_threads_pid[m_pool_rank] = std::this_thread::get_id(); } } -ThreadsExec::~ThreadsExec() { +ThreadsInternal::~ThreadsInternal() { const unsigned entry = m_pool_size - (m_pool_rank + 1); - using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HostSpace, void>; - if (m_scratch) { - Record *const r = Record::get_record(m_scratch); - + Kokkos::kokkos_free(m_scratch); m_scratch = nullptr; - - Record::decrement(r); } m_pool_base = nullptr; m_scratch_reduce_end = 0; m_scratch_thread_end = 0; - m_numa_rank = 0; - m_numa_core_rank = 0; m_pool_rank = 0; m_pool_size = 0; m_pool_fan_size = 0; - m_pool_state = ThreadsExec::Terminating; + m_pool_state = ThreadState::Terminating; if (&s_threads_process != this && entry < MAX_THREAD_COUNT) { - ThreadsExec *const nil = nullptr; + ThreadsInternal *const nil = nullptr; atomic_compare_exchange(s_threads_exec + entry, this, nil); - s_threads_process.m_pool_state = ThreadsExec::Terminating; + s_threads_process.m_pool_state = ThreadState::Terminating; } } -int ThreadsExec::get_thread_count() { return s_thread_pool_size[0]; } - -ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { - ThreadsExec *const th = +ThreadsInternal *ThreadsInternal::get_thread(const int init_thread_rank) { + ThreadsInternal *const th = init_thread_rank < s_thread_pool_size[0] ? s_threads_exec[s_thread_pool_size[0] - (init_thread_rank + 1)] : nullptr; if (nullptr == th || th->m_pool_rank != init_thread_rank) { std::ostringstream msg; - msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : " + msg << "Kokkos::Impl::ThreadsInternal::get_thread ERROR : " << "thread " << init_thread_rank << " of " << s_thread_pool_size[0]; if (nullptr == th) { msg << " does not exist"; @@ -264,24 +216,6 @@ ThreadsExec *ThreadsExec::get_thread(const int init_thread_rank) { return th; } -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { - ThreadsExec::global_lock(); - ThreadsExec::global_unlock(); - - const int n = exec.m_pool_fan_size; - const int rank_rev = exec.m_pool_size - (exec.m_pool_rank + 1); - - for (int i = 0; i < n; ++i) { - Impl::spinwait_while_equal( - exec.m_pool_base[rank_rev + (1 << i)]->m_pool_state, - ThreadsExec::Active); - } - - exec.m_pool_state = ThreadsExec::Inactive; -} - } // namespace Impl } // namespace Kokkos @@ -290,8 +224,8 @@ void ThreadsExec::execute_sleep(ThreadsExec &exec, const void *) { namespace Kokkos { namespace Impl { -void ThreadsExec::verify_is_process(const std::string &name, - const bool initialized) { +void ThreadsInternal::verify_is_process(const std::string &name, + const bool initialized) { if (!is_process()) { std::string msg(name); msg.append( @@ -307,63 +241,48 @@ void ThreadsExec::verify_is_process(const std::string &name, } } -int ThreadsExec::in_parallel() { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED int ThreadsInternal::in_parallel() { // A thread function is in execution and // the function argument is not the special threads process argument and // the master process is a worker or is not the master process.
return s_current_function && (&s_threads_process != s_current_function_arg) && (s_threads_process.m_pool_base || !is_process()); } -void ThreadsExec::fence() { internal_fence(Impl::fence_is_static::yes); } -void ThreadsExec::fence(const std::string &name) { - internal_fence(name, Impl::fence_is_static::yes); +#endif +void ThreadsInternal::fence() { + fence("Kokkos::ThreadsInternal::fence: Unnamed Instance Fence"); } - -void ThreadsExec::internal_fence(Impl::fence_is_static is_static) { - internal_fence((is_static == Impl::fence_is_static::no) - ? "Kokkos::ThreadsExec::fence: Unnamed Instance Fence" - : "Kokkos::ThreadsExec::fence: Unnamed Static Fence", - is_static); +void ThreadsInternal::fence(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + internal_fence); } // Wait for root thread to become inactive -void ThreadsExec::internal_fence(const std::string &name, - Impl::fence_is_static is_static) { - const auto &fence_lam = [&]() { - if (s_thread_pool_size[0]) { - // Wait for the root thread to complete: - Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, - ThreadsExec::Active); - } +void ThreadsInternal::internal_fence() { + if (s_thread_pool_size[0]) { + // Wait for the root thread to complete: + Impl::spinwait_while_equal(s_threads_exec[0]->m_pool_state, + ThreadState::Active); + } - s_current_function = nullptr; - s_current_function_arg = nullptr; + s_current_function = nullptr; + s_current_function_arg = nullptr; - // Make sure function and arguments are cleared before - // potentially re-activating threads with a subsequent launch. - memory_fence(); - }; - if (is_static == Impl::fence_is_static::yes) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, - Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - fence_lam); - } else { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - fence_lam); - } + // Make sure function and arguments are cleared before - // potentially re-activating threads with a subsequent launch. + memory_fence(); } /** \brief Begin execution of the asynchronous functor */ -void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), - const void *arg) { - verify_is_process("ThreadsExec::start", true); +void ThreadsInternal::start(void (*func)(ThreadsInternal &, const void *), + const void *arg) { + verify_is_process("ThreadsInternal::start", true); if (s_current_function || s_current_function_arg) { Kokkos::Impl::throw_runtime_exception( - std::string("ThreadsExec::start() FAILED : already executing")); + std::string("ThreadsInternal::start() FAILED : already executing")); } s_current_function = func; @@ -372,68 +291,29 @@ void ThreadsExec::start(void (*func)(ThreadsExec &, const void *), // Make sure function and arguments are written before activating threads. memory_fence(); - // Activate threads: + // Activate threads. The spawned threads will start working on + // s_current_function. The root thread is only set to active; we still need to + // call s_current_function.
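//----------------------------------------------------------------------------
// Usage of start()/fence() as seen in the ParallelFor/ParallelReduce
// specializations further down: start() wakes the pool and runs the functor
// on the root thread as well, fence() waits for the root thread and clears
// s_current_function. A hedged sketch of that call sequence (Driver stands in
// for a ParallelFor-like type with a static exec trampoline):

template <class Driver>
void dispatch_and_wait(const Driver &driver) {
  ThreadsInternal::start(&Driver::exec, &driver);  // activate the pool
  ThreadsInternal::fence();                        // drain before returning
}
//----------------------------------------------------------------------------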
for (int i = s_thread_pool_size[0]; 0 < i--;) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Active; + s_threads_exec[i]->m_pool_state = ThreadState::Active; } if (s_threads_process.m_pool_size) { // Master process is the root thread, run it: (*func)(s_threads_process, arg); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } } //---------------------------------------------------------------------------- -bool ThreadsExec::sleep() { - verify_is_process("ThreadsExec::sleep", true); - - if (&execute_sleep == s_current_function) return false; - - fence(); - - ThreadsExec::global_lock(); - - s_current_function = &execute_sleep; - - // Activate threads: - for (unsigned i = s_thread_pool_size[0]; 0 < i;) { - s_threads_exec[--i]->m_pool_state = ThreadsExec::Active; - } - - return true; -} - -bool ThreadsExec::wake() { - verify_is_process("ThreadsExec::wake", true); - - if (&execute_sleep != s_current_function) return false; - - ThreadsExec::global_unlock(); - - if (s_threads_process.m_pool_base) { - execute_sleep(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; - } - - fence(); - - return true; -} - -//---------------------------------------------------------------------------- - -void ThreadsExec::execute_resize_scratch_in_serial() { +void ThreadsInternal::execute_resize_scratch_in_serial() { const unsigned begin = s_threads_process.m_pool_base ? 1 : 0; - auto deallocate_scratch_memory = [](ThreadsExec &exec) { + auto deallocate_scratch_memory = [](ThreadsInternal &exec) { if (exec.m_scratch) { - using Record = - Kokkos::Impl::SharedAllocationRecord; - Record *const r = Record::get_record(exec.m_scratch); - exec.m_scratch = nullptr; - Record::decrement(r); + Kokkos::kokkos_free(exec.m_scratch); + exec.m_scratch = nullptr; } }; if (s_threads_process.m_pool_base) { @@ -449,18 +329,18 @@ void ThreadsExec::execute_resize_scratch_in_serial() { memory_fence(); for (unsigned i = s_thread_pool_size[0]; begin < i;) { - ThreadsExec &th = *s_threads_exec[--i]; + ThreadsInternal &th = *s_threads_exec[--i]; - th.m_pool_state = ThreadsExec::Active; + th.m_pool_state = ThreadState::Active; - wait_yield(th.m_pool_state, ThreadsExec::Active); + wait_yield(th.m_pool_state, ThreadState::Active); } if (s_threads_process.m_pool_base) { deallocate_scratch_memory(s_threads_process); - s_threads_process.m_pool_state = ThreadsExec::Active; + s_threads_process.m_pool_state = ThreadState::Active; first_touch_allocate_thread_private_scratch(s_threads_process, nullptr); - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_current_function_arg = nullptr; @@ -472,27 +352,20 @@ void ThreadsExec::execute_resize_scratch_in_serial() { //---------------------------------------------------------------------------- -void *ThreadsExec::root_reduce_scratch() { +void *ThreadsInternal::root_reduce_scratch() { return s_threads_process.reduce_memory(); } -void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, - const void *) { +void ThreadsInternal::first_touch_allocate_thread_private_scratch( + ThreadsInternal &exec, const void *) { exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end; exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end; if (s_threads_process.m_scratch_thread_end) { // Allocate tracked memory: { - using Record = - Kokkos::Impl::SharedAllocationRecord; - Record *const r = - 
Record::allocate(Kokkos::HostSpace(), "Kokkos::thread_scratch", - s_threads_process.m_scratch_thread_end); - - Record::increment(r); - - exec.m_scratch = r->data(); + exec.m_scratch = Kokkos::kokkos_malloc( + "Kokkos::thread_scratch", s_threads_process.m_scratch_thread_end); } unsigned *ptr = reinterpret_cast(exec.m_scratch); @@ -505,7 +378,7 @@ void ThreadsExec::first_touch_allocate_thread_private_scratch(ThreadsExec &exec, } } -void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { +void *ThreadsInternal::resize_scratch(size_t reduce_size, size_t thread_size) { enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 }; fence(); @@ -522,7 +395,7 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { if ((old_reduce_size < reduce_size) || (old_thread_size < thread_size) || ((reduce_size == 0 && thread_size == 0) && (old_reduce_size != 0 || old_thread_size != 0))) { - verify_is_process("ThreadsExec::resize_scratch", true); + verify_is_process("ThreadsInternal::resize_scratch", true); s_threads_process.m_scratch_reduce_end = reduce_size; s_threads_process.m_scratch_thread_end = reduce_size + thread_size; @@ -537,27 +410,22 @@ void *ThreadsExec::resize_scratch(size_t reduce_size, size_t thread_size) { //---------------------------------------------------------------------------- -void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { - verify_is_process("ThreadsExec::print_configuration", false); +void ThreadsInternal::print_configuration(std::ostream &s, const bool detail) { + verify_is_process("ThreadsInternal::print_configuration", false); fence(); - const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); - const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); - const unsigned threads_per_core = - Kokkos::hwloc::get_available_threads_per_core(); - - // Forestall compiler warnings for unused variables. - (void)numa_count; - (void)cores_per_numa; - (void)threads_per_core; - s << "Kokkos::Threads"; #if defined(KOKKOS_ENABLE_THREADS) s << " KOKKOS_ENABLE_THREADS"; #endif #if defined(KOKKOS_ENABLE_HWLOC) + const unsigned numa_count = Kokkos::hwloc::get_available_numa_count(); + const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa(); + const unsigned threads_per_core = + Kokkos::hwloc::get_available_threads_per_core(); + s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]"; #endif @@ -569,25 +437,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { if (nullptr == s_threads_process.m_pool_base) { s << " Asynchronous"; } - s << " ReduceScratch[" << s_current_reduce_size << "]" - << " SharedScratch[" << s_current_shared_size << "]"; s << std::endl; if (detail) { for (int i = 0; i < s_thread_pool_size[0]; ++i) { - ThreadsExec *const th = s_threads_exec[i]; + ThreadsInternal *const th = s_threads_exec[i]; if (th) { const int rank_rev = th->m_pool_size - (th->m_pool_rank + 1); - s << " Thread[ " << th->m_pool_rank << " : " << th->m_numa_rank << "." - << th->m_numa_core_rank << " ]"; + s << " Thread[ " << th->m_pool_rank << " ]"; s << " Fan{"; for (int j = 0; j < th->m_pool_fan_size; ++j) { - ThreadsExec *const thfan = th->m_pool_base[rank_rev + (1 << j)]; - s << " [ " << thfan->m_pool_rank << " : " << thfan->m_numa_rank - << "." 
<< thfan->m_numa_core_rank << " ]"; + ThreadsInternal *const thfan = th->m_pool_base[rank_rev + (1 << j)]; + s << " [ " << thfan->m_pool_rank << " ]"; } s << " }"; @@ -605,29 +469,21 @@ void ThreadsExec::print_configuration(std::ostream &s, const bool detail) { //---------------------------------------------------------------------------- -int ThreadsExec::is_initialized() { return nullptr != s_threads_exec[0]; } +int ThreadsInternal::is_initialized() { return nullptr != s_threads_exec[0]; } -void ThreadsExec::initialize(int thread_count_arg) { - // legacy arguments - unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; - unsigned use_numa_count = 0; - unsigned use_cores_per_numa = 0; - bool allow_asynchronous_threadpool = false; - // need to provide an initializer for Intel compilers - static const Sentinel sentinel = {}; +void ThreadsInternal::initialize(int thread_count_arg) { + unsigned thread_count = thread_count_arg == -1 ? 0 : thread_count_arg; const bool is_initialized = 0 != s_thread_pool_size[0]; unsigned thread_spawn_failed = 0; - for (int i = 0; i < ThreadsExec::MAX_THREAD_COUNT; i++) + for (int i = 0; i < ThreadsInternal::MAX_THREAD_COUNT; i++) s_threads_exec[i] = nullptr; if (!is_initialized) { - // If thread_count, use_numa_count, or use_cores_per_numa are zero - // then they will be given default values based upon hwloc detection - // and allowed asynchronous execution. - + // If thread_count is zero then it will be given default values based upon + // hwloc detection. const bool hwloc_avail = Kokkos::hwloc::available(); const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads(); @@ -640,17 +496,18 @@ void ThreadsExec::initialize(int thread_count_arg) { : 1; } - const unsigned thread_spawn_begin = hwloc::thread_mapping( - "Kokkos::Threads::initialize", allow_asynchronous_threadpool, - thread_count, use_numa_count, use_cores_per_numa, s_threads_coord); + const bool allow_asynchronous_threadpool = false; + unsigned use_numa_count = 0; + unsigned use_cores_per_numa = 0; + hwloc::thread_mapping("Kokkos::Threads::initialize", + allow_asynchronous_threadpool, thread_count, + use_numa_count, use_cores_per_numa, s_threads_coord); const std::pair proc_coord = s_threads_coord[0]; - if (thread_spawn_begin) { - // Synchronous with s_threads_coord[0] as the process core - // Claim entry #0 for binding the process core. - s_threads_coord[0] = std::pair(~0u, ~0u); - } + // Synchronous with s_threads_coord[0] as the process core + // Claim entry #0 for binding the process core. + s_threads_coord[0] = std::pair(~0u, ~0u); s_thread_pool_size[0] = thread_count; s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count; @@ -658,8 +515,8 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = &execute_function_noop; // Initialization work function - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { - s_threads_process.m_pool_state = ThreadsExec::Inactive; + for (unsigned ith = 1; ith < thread_count; ++ith) { + s_threads_process.m_pool_state = ThreadState::Inactive; // If hwloc available then spawned thread will // choose its own entry in 's_threads_coord' @@ -675,18 +532,20 @@ void ThreadsExec::initialize(int thread_count_arg) { // Wait until spawned thread has attempted to initialize. // If spawning and initialization is successful then // an entry in 's_threads_exec' will be assigned. 
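//----------------------------------------------------------------------------
// The spawn handshake below, isolated: the parent arms m_pool_state, detaches
// a worker, and spins until the worker either registers itself (Active) or
// gives up (Terminating). A sketch reusing wait_yield and ThreadState from
// this file; `driver` stands in for internal_cppthread_driver:

inline bool spawn_one(volatile ThreadState &process_state, void (*driver)()) {
  process_state = ThreadState::Inactive;  // arm the handshake
  std::thread t(driver);
  t.detach();  // the worker flips the state once it has claimed an entry
  wait_yield(process_state, ThreadState::Inactive);
  return process_state != ThreadState::Terminating;  // false: spawn failed
}
//----------------------------------------------------------------------------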
- ThreadsExec::spawn(); - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); - if (s_threads_process.m_pool_state == ThreadsExec::Terminating) break; + std::thread t(internal_cppthread_driver); + t.detach(); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); + if (s_threads_process.m_pool_state == ThreadState::Terminating) break; } // Wait for all spawned threads to deactivate before zeroing the function. - for (unsigned ith = thread_spawn_begin; ith < thread_count; ++ith) { + for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. - ThreadsExec *const th = ((ThreadsExec * volatile *)s_threads_exec)[ith]; + ThreadsInternal *const th = + ((ThreadsInternal * volatile *)s_threads_exec)[ith]; if (th) { - wait_yield(th->m_pool_state, ThreadsExec::Active); + wait_yield(th->m_pool_state, ThreadState::Active); } else { ++thread_spawn_failed; } @@ -694,7 +553,7 @@ void ThreadsExec::initialize(int thread_count_arg) { s_current_function = nullptr; s_current_function_arg = nullptr; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; memory_fence(); @@ -705,30 +564,17 @@ void ThreadsExec::initialize(int thread_count_arg) { Kokkos::hwloc::bind_this_thread(proc_coord); } - if (thread_spawn_begin) { // Include process in pool. - const std::pair coord = - Kokkos::hwloc::get_this_thread_coordinate(); - - s_threads_exec[0] = &s_threads_process; - s_threads_process.m_numa_rank = coord.first; - s_threads_process.m_numa_core_rank = coord.second; - s_threads_process.m_pool_base = s_threads_exec; - s_threads_process.m_pool_rank = - thread_count - 1; // Reversed for scan-compatible reductions - s_threads_process.m_pool_size = thread_count; - s_threads_process.m_pool_fan_size = fan_size( - s_threads_process.m_pool_rank, s_threads_process.m_pool_size); - s_threads_pid[s_threads_process.m_pool_rank] = - std::this_thread::get_id(); - } else { - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 0; - s_threads_process.m_pool_fan_size = 0; - } + s_threads_exec[0] = &s_threads_process; + s_threads_process.m_pool_base = s_threads_exec; + s_threads_process.m_pool_rank = + thread_count - 1; // Reversed for scan-compatible reductions + s_threads_process.m_pool_size = thread_count; + s_threads_process.m_pool_fan_size = fan_size( + s_threads_process.m_pool_rank, s_threads_process.m_pool_size); + s_threads_pid[s_threads_process.m_pool_rank] = std::this_thread::get_id(); // Initial allocations: - ThreadsExec::resize_scratch(1024, 1024); + ThreadsInternal::resize_scratch(1024, 1024); } else { s_thread_pool_size[0] = 0; s_thread_pool_size[1] = 0; @@ -773,8 +619,8 @@ void ThreadsExec::initialize(int thread_count_arg) { //---------------------------------------------------------------------------- -void ThreadsExec::finalize() { - verify_is_process("ThreadsExec::finalize", false); +void ThreadsInternal::finalize() { + verify_is_process("ThreadsInternal::finalize", false); fence(); @@ -784,18 +630,18 @@ void ThreadsExec::finalize() { for (unsigned i = s_thread_pool_size[0]; begin < i--;) { if (s_threads_exec[i]) { - s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating; + s_threads_exec[i]->m_pool_state = ThreadState::Terminating; - wait_yield(s_threads_process.m_pool_state, ThreadsExec::Inactive); + wait_yield(s_threads_process.m_pool_state, ThreadState::Inactive); - s_threads_process.m_pool_state = 
ThreadsExec::Inactive; + s_threads_process.m_pool_state = ThreadState::Inactive; } s_threads_pid[i] = std::thread::id(); } if (s_threads_process.m_pool_base) { - (&s_threads_process)->~ThreadsExec(); + (&s_threads_process)->~ThreadsInternal(); s_threads_exec[0] = nullptr; } @@ -808,13 +654,11 @@ void ThreadsExec::finalize() { s_thread_pool_size[2] = 0; // Reset master thread to run solo. - s_threads_process.m_numa_rank = 0; - s_threads_process.m_numa_core_rank = 0; - s_threads_process.m_pool_base = nullptr; - s_threads_process.m_pool_rank = 0; - s_threads_process.m_pool_size = 1; - s_threads_process.m_pool_fan_size = 0; - s_threads_process.m_pool_state = ThreadsExec::Inactive; + s_threads_process.m_pool_base = nullptr; + s_threads_process.m_pool_rank = 0; + s_threads_process.m_pool_size = 1; + s_threads_process.m_pool_fan_size = 0; + s_threads_process.m_pool_state = ThreadState::Inactive; } //---------------------------------------------------------------------------- @@ -834,7 +678,7 @@ int Threads::concurrency() const { return impl_thread_pool_size(0); } #endif void Threads::fence(const std::string &name) const { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::no); + Impl::ThreadsInternal::fence(name); } Threads &Threads::impl_instance(int) { diff --git a/core/src/Threads/Kokkos_ThreadsExec.hpp b/core/src/Threads/Kokkos_Threads_Instance.hpp similarity index 76% rename from core/src/Threads/Kokkos_ThreadsExec.hpp rename to core/src/Threads/Kokkos_Threads_Instance.hpp index 377e096bfbe..a5eb231cb01 100644 --- a/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -14,8 +14,8 @@ // //@HEADER -#ifndef KOKKOS_THREADSEXEC_HPP -#define KOKKOS_THREADSEXEC_HPP +#ifndef KOKKOS_THREADS_INSTANCE_HPP +#define KOKKOS_THREADS_INSTANCE_HPP #include @@ -23,41 +23,25 @@ #include #include -#include - #include #include #include #include +#include +#include //---------------------------------------------------------------------------- namespace Kokkos { namespace Impl { -class ThreadsExec { +class ThreadsInternal { public: // Fan array has log_2(NT) reduction threads plus 2 scan threads // Currently limited to 16k threads. - enum { MAX_FAN_COUNT = 16 }; - enum { MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2) }; - enum { VECTOR_LENGTH = 8 }; - - /** \brief States of a worker thread */ - enum { - Terminating ///< Termination in progress - , - Inactive ///< Exists, waiting for work - , - Active ///< Exists, performing work - , - Rendezvous ///< Exists, waiting in a barrier or reduce - - , - ScanCompleted, - ScanAvailable, - ReductionAvailable - }; + static constexpr int MAX_FAN_COUNT = 16; + static constexpr int MAX_THREAD_COUNT = 1 << (MAX_FAN_COUNT - 2); + static constexpr int VECTOR_LENGTH = 8; private: friend class Kokkos::Threads; @@ -67,18 +51,16 @@ class ThreadsExec { // the threads that need them. // For a simple reduction the thread location is arbitrary. 
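//----------------------------------------------------------------------------
// Spelled out, the constants above encode the "16k threads" limit: the fan
// array holds log2(NT) reduction slots plus 2 scan slots, so a MAX_FAN_COUNT
// of 16 caps the pool at 1 << (16 - 2) = 16384 threads:

static_assert((1 << (16 - 2)) == 16384, "MAX_THREAD_COUNT is 16k");
//----------------------------------------------------------------------------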
- ThreadsExec *const *m_pool_base; ///< Base for pool fan-in + ThreadsInternal *const *m_pool_base; ///< Base for pool fan-in void *m_scratch; int m_scratch_reduce_end; size_t m_scratch_thread_end; - int m_numa_rank; - int m_numa_core_rank; int m_pool_rank; int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - int volatile m_pool_state; ///< State for global synchronizations + ThreadState volatile m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -93,41 +75,36 @@ class ThreadsExec { static void global_lock(); static void global_unlock(); - static void spawn(); - static void first_touch_allocate_thread_private_scratch(ThreadsExec &, + static void first_touch_allocate_thread_private_scratch(ThreadsInternal &, const void *); - static void execute_sleep(ThreadsExec &, const void *); - ThreadsExec(const ThreadsExec &); - ThreadsExec &operator=(const ThreadsExec &); + ThreadsInternal(const ThreadsInternal &); + ThreadsInternal &operator=(const ThreadsInternal &); static void execute_resize_scratch_in_serial(); public: KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size; } KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank; } - KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank; } - KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank; } inline long team_work_index() const { return m_team_work_index; } - static int get_thread_count(); - static ThreadsExec *get_thread(const int init_thread_rank); + static ThreadsInternal *get_thread(const int init_thread_rank); inline void *reduce_memory() const { return m_scratch; } KOKKOS_INLINE_FUNCTION void *scratch_memory() const { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION int volatile &state() { return m_pool_state; } - KOKKOS_INLINE_FUNCTION ThreadsExec *const *pool_base() const { + KOKKOS_INLINE_FUNCTION ThreadState volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } static void driver(void); - ~ThreadsExec(); - ThreadsExec(); + ~ThreadsInternal(); + ThreadsInternal(); static void *resize_scratch(size_t reduce_size, size_t thread_size); @@ -143,15 +120,8 @@ class ThreadsExec { static void finalize(); - /* Given a requested team size, return valid team size */ - static unsigned team_size_valid(unsigned); - static void print_configuration(std::ostream &, const bool detail = false); - //------------------------------------ - - static void wait_yield(volatile int &, const int); - //------------------------------------ // All-thread functions: @@ -166,14 +136,14 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast @@ -191,7 +161,7 @@ class ThreadsExec { memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; 
+ get_thread(rank)->m_pool_state = ThreadState::Active; } } @@ -207,21 +177,21 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the reduction and broadcast memory_fence(); for (int rank = 0; rank < m_pool_size; ++rank) { - get_thread(rank)->m_pool_state = ThreadsExec::Active; + get_thread(rank)->m_pool_state = ThreadState::Active; } } } @@ -234,9 +204,9 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join( reinterpret_cast(reduce_memory()), @@ -265,8 +235,8 @@ class ThreadsExec { const int rev_rank = m_pool_size - (m_pool_rank + 1); for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } } @@ -289,10 +259,10 @@ class ThreadsExec { //-------------------------------- // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: Active -> ReductionAvailable (or ScanAvailable) - Impl::spinwait_while_equal(fan.m_pool_state, ThreadsExec::Active); + spinwait_while_equal(fan.m_pool_state, ThreadState::Active); f.join(work_value, fan.reduce_memory()); } @@ -303,39 +273,37 @@ class ThreadsExec { if (rev_rank) { // Set: Active -> ReductionAvailable - m_pool_state = ThreadsExec::ReductionAvailable; + m_pool_state = ThreadState::ReductionAvailable; // Wait for contributing threads' scan value to be available. 
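//----------------------------------------------------------------------------
// ThreadState itself is defined in a header included above (its name is
// elided in this diff); judging from the anonymous enum it replaces, it is an
// enum class along these lines:

enum class ThreadState {
  Terminating,        // termination in progress
  Inactive,           // exists, waiting for work
  Active,             // exists, performing work
  Rendezvous,         // exists, waiting in a barrier or reduce
  ScanCompleted,
  ScanAvailable,
  ReductionAvailable
};

// Over one scan a worker walks roughly Active -> ReductionAvailable ->
// ScanAvailable -> Rendezvous -> ScanCompleted -> Active, the sequence the
// spinwaits in this hunk key on.
//----------------------------------------------------------------------------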
if ((1 << m_pool_fan_size) < (m_pool_rank + 1)) { - ThreadsExec &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; + ThreadsInternal &th = *m_pool_base[rev_rank + (1 << m_pool_fan_size)]; // Wait: Active -> ReductionAvailable // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(th.m_pool_state, ThreadsExec::Active); - Impl::spinwait_while_equal(th.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(th.m_pool_state, ThreadState::Active); + spinwait_while_equal(th.m_pool_state, ThreadState::ReductionAvailable); f.join(work_value + count, ((scalar_type *)th.reduce_memory()) + count); } // This thread has completed inclusive scan // Set: ReductionAvailable -> ScanAvailable - m_pool_state = ThreadsExec::ScanAvailable; + m_pool_state = ThreadState::ScanAvailable; // Wait for all threads to complete inclusive scan // Wait: ScanAvailable -> Rendezvous - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanAvailable); + spinwait_while_equal(m_pool_state, ThreadState::ScanAvailable); } //-------------------------------- for (int i = 0; i < m_pool_fan_size; ++i) { - ThreadsExec &fan = *m_pool_base[rev_rank + (1 << i)]; + ThreadsInternal &fan = *m_pool_base[rev_rank + (1 << i)]; // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal(fan.m_pool_state, - ThreadsExec::ReductionAvailable); + spinwait_while_equal(fan.m_pool_state, ThreadState::ReductionAvailable); // Set: ScanAvailable -> Rendezvous - fan.m_pool_state = ThreadsExec::Rendezvous; + fan.m_pool_state = ThreadState::Rendezvous; } // All threads have completed the inclusive scan. @@ -346,7 +314,7 @@ class ThreadsExec { if ((rev_rank + 1) < m_pool_size) { // Exclusive scan: copy the previous thread's inclusive scan value - ThreadsExec &th = *m_pool_base[rev_rank + 1]; // Not the root thread + ThreadsInternal &th = *m_pool_base[rev_rank + 1]; // Not the root thread const scalar_type *const src_value = ((scalar_type *)th.reduce_memory()) + count; @@ -362,19 +330,18 @@ class ThreadsExec { // Wait for all threads to copy previous thread's inclusive scan value // Wait for all threads: Rendezvous -> ScanCompleted for (int i = 0; i < m_pool_fan_size; ++i) { - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, - ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Rendezvous); } if (rev_rank) { // Set: ScanAvailable -> ScanCompleted - m_pool_state = ThreadsExec::ScanCompleted; + m_pool_state = ThreadState::ScanCompleted; // Wait: ScanCompleted -> Active - Impl::spinwait_while_equal(m_pool_state, ThreadsExec::ScanCompleted); + spinwait_while_equal(m_pool_state, ThreadState::ScanCompleted); } // Set: ScanCompleted -> Active for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -391,8 +358,8 @@ class ThreadsExec { // Fan-in reduction with highest ranking thread as the root for (int i = 0; i < m_pool_fan_size; ++i) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( - m_pool_base[rev_rank + (1 << i)]->m_pool_state, ThreadsExec::Active); + spinwait_while_equal(m_pool_base[rev_rank + (1 << i)]->m_pool_state, + ThreadState::Active); } for (unsigned i = 0; i < count; ++i) { @@ -400,9 +367,9 @@ class ThreadsExec { } if (rev_rank) { - m_pool_state = ThreadsExec::Rendezvous; + m_pool_state = ThreadState::Rendezvous; // Wait: Rendezvous -> Active - 
Impl::spinwait_while_equal(m_pool_state, ThreadsExec::Rendezvous); + spinwait_while_equal(m_pool_state, ThreadState::Rendezvous); } else { // Root thread does the thread-scan before releasing threads @@ -424,7 +391,7 @@ class ThreadsExec { } for (int i = 0; i < m_pool_fan_size; ++i) { - m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadsExec::Active; + m_pool_base[rev_rank + (1 << i)]->m_pool_state = ThreadState::Active; } } @@ -433,18 +400,14 @@ class ThreadsExec { * complete and release the Threads device. * Acquire the Threads device and start this functor. */ - static void start(void (*)(ThreadsExec &, const void *), const void *); + static void start(void (*)(ThreadsInternal &, const void *), const void *); - static int in_parallel(); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static int in_parallel(); +#endif static void fence(); static void fence(const std::string &); - static void internal_fence( - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static void internal_fence( - const std::string &, - Impl::fence_is_static is_static = Impl::fence_is_static::yes); - static bool sleep(); - static bool wake(); + static void internal_fence(); /* Dynamic Scheduling related functionality */ // Initialize the work range for this thread @@ -583,30 +546,38 @@ class ThreadsExec { namespace Kokkos { -inline int Threads::in_parallel() { return Impl::ThreadsExec::in_parallel(); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline int Threads::in_parallel() { + return Impl::ThreadsInternal::in_parallel(); +} +#endif inline int Threads::impl_is_initialized() { - return Impl::ThreadsExec::is_initialized(); + return Impl::ThreadsInternal::is_initialized(); } inline void Threads::impl_initialize(InitializationSettings const &settings) { - Impl::ThreadsExec::initialize( + Impl::ThreadsInternal::initialize( settings.has_num_threads() ? 
settings.get_num_threads() : -1); } -inline void Threads::impl_finalize() { Impl::ThreadsExec::finalize(); } +inline void Threads::impl_finalize() { Impl::ThreadsInternal::finalize(); } inline void Threads::print_configuration(std::ostream &os, bool verbose) const { os << "Host Parallel Execution Space:\n"; os << " KOKKOS_ENABLE_THREADS: yes\n"; os << "\nThreads Runtime Configuration:\n"; - Impl::ThreadsExec::print_configuration(os, verbose); + Impl::ThreadsInternal::print_configuration(os, verbose); } inline void Threads::impl_static_fence(const std::string &name) { - Impl::ThreadsExec::internal_fence(name, Impl::fence_is_static::yes); + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + Impl::ThreadsInternal::internal_fence); } } /* namespace Kokkos */ -#endif /* #define KOKKOS_THREADSEXEC_HPP */ +#endif diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 0828f262993..59577609ab7 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -46,54 +46,54 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); self.exec_range(range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); auto const num_tiles = self.m_iter.m_rp.m_num_tiles; - WorkRange range(Policy(0, num_tiles).set_chunk_size(1), exec.pool_rank(), - exec.pool_size()); + WorkRange range(Policy(0, num_tiles).set_chunk_size(1), + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? 
begin + 1 : num_tiles; self.exec_range(begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const MDRangePolicy &arg_policy) diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 3698416ef18..4a89c4fad82 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -59,37 +59,37 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); ParallelFor::template exec_range(self.m_functor, range.begin(), range.end()); - exec.fan_in(); + instance.fan_in(); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); - WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + WorkRange range(self.m_policy, instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); while (work_index != -1) { const Member begin = @@ -100,16 +100,16 @@ class ParallelFor, ? 
begin + self.m_policy.chunk_size() : self.m_policy.end(); ParallelFor::template exec_range(self.m_functor, begin, end); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::start(&ParallelFor::exec, this); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index 36404857a22..f927d7c6a67 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -73,14 +73,14 @@ class ParallelFor, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); ParallelFor::exec_team( - self.m_functor, Member(&exec, self.m_policy, self.m_shared)); + self.m_functor, Member(&instance, self.m_policy, self.m_shared)); - exec.barrier(); - exec.fan_in(); + instance.barrier(); + instance.fan_in(); } template Policy fix_policy(Policy policy) { @@ -96,12 +96,12 @@ class ParallelFor, public: inline void execute() const { - ThreadsExec::resize_scratch( + ThreadsInternal::resize_scratch( 0, Policy::member_type::team_reduce_size() + m_shared); - ThreadsExec::start(&ParallelFor::exec, this); + ThreadsInternal::start(&ParallelFor::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); } ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy) diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index 3d06379480f..fa63215a9e5 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -54,67 +54,67 @@ class ParallelReduce(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); self.exec_range( range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const auto num_tiles = self.m_iter.m_rp.m_num_tiles; const WorkRange range(Policy(0, num_tiles).set_chunk_size(1), - exec.pool_rank(), exec.pool_size()); + instance.pool_rank(), instance.pool_size()); - exec.set_work_range(range.begin(), range.end(), 1); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin(), range.end(), 1); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index 
= instance.get_work_index(); const ReducerType &reducer = self.m_iter.m_func.get_reducer(); - reference_type update = - self.m_reducer.init(static_cast(exec.reduce_memory())); + reference_type update = self.m_reducer.init( + static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; self.exec_range(begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(self.m_reducer); + instance.fan_in_reduce(self.m_reducer); } public: inline void execute() const { const ReducerType &reducer = m_iter.m_func.get_reducer(); - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp index 5fa97b403c4..bf4c2a532a1 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_Range.hpp @@ -68,42 +68,44 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { - exec_schedule(exec, arg); + static void exec(ThreadsInternal &instance, const void *arg) { + exec_schedule(instance, arg); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); ParallelReduce::template exec_range( self.m_functor_reducer.get_functor(), range.begin(), range.end(), - reducer.init(static_cast(exec.reduce_memory()))); + reducer.init(static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } template static std::enable_if_t::value> - exec_schedule(ThreadsExec &exec, const void *arg) { + exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); - exec.set_work_range(range.begin() - self.m_policy.begin(), - range.end() - self.m_policy.begin(), - self.m_policy.chunk_size()); - exec.reset_steal_target(); - exec.barrier(); + instance.set_work_range(range.begin() - self.m_policy.begin(), + range.end() - self.m_policy.begin(), + self.m_policy.chunk_size()); + instance.reset_steal_target(); + instance.barrier(); - long work_index = exec.get_work_index(); + long work_index = instance.get_work_index(); const ReducerType &reducer = self.m_functor_reducer.get_reducer(); reference_type update = - reducer.init(static_cast(exec.reduce_memory())); + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = 
static_cast(work_index) * self.m_policy.chunk_size() + @@ -114,10 +116,10 @@ class ParallelReduce, : self.m_policy.end(); ParallelReduce::template exec_range( self.m_functor_reducer.get_functor(), begin, end, update); - work_index = exec.get_work_index(); + work_index = instance.get_work_index(); } - exec.fan_in_reduce(reducer); + instance.fan_in_reduce(reducer); } public: @@ -130,15 +132,15 @@ class ParallelReduce, reducer.final(m_result_ptr); } } else { - ThreadsExec::resize_scratch(reducer.value_size(), 0); + ThreadsInternal::resize_scratch(reducer.value_size(), 0); - ThreadsExec::start(&ParallelReduce::exec, this); + ThreadsInternal::start(&ParallelReduce::exec, this); - ThreadsExec::fence(); + ThreadsInternal::fence(); if (m_result_ptr) { const pointer_type data = - (pointer_type)ThreadsExec::root_reduce_scratch(); + (pointer_type)ThreadsInternal::root_reduce_scratch(); const unsigned n = reducer.value_count(); for (unsigned i = 0; i < n; ++i) { diff --git a/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index c4b6100a9df..4db310701f9 100644 --- a/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -58,16 +58,16 @@ class ParallelReduce( self.m_functor_reducer.get_functor(), - Member(&exec, self.m_policy, self.m_shared), + Member(&instance, self.m_policy, self.m_shared), self.m_functor_reducer.get_reducer().init( - static_cast(exec.reduce_memory()))); + static_cast(instance.reduce_memory()))); - exec.fan_in_reduce(self.m_functor_reducer.get_reducer()); + instance.fan_in_reduce(self.m_functor_reducer.get_reducer()); } public: @@ -80,17 +80,17 @@ class ParallelReduce, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScan &self = *((const ParallelScan *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - final_reducer.init(static_cast(exec.reduce_memory())); + final_reducer.init(static_cast(instance.reduce_memory())); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large( final_reducer ); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScan::template exec_range(self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScan::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScan::exec, this); + ThreadsInternal::fence(); } ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy) @@ -145,37 +145,37 @@ class ParallelScanWithTotal, } } - static void exec(ThreadsExec &exec, const void *arg) { + static void exec(ThreadsInternal &instance, const void *arg) { const ParallelScanWithTotal &self = *((const ParallelScanWithTotal *)arg); - const WorkRange range(self.m_policy, exec.pool_rank(), exec.pool_size()); + const WorkRange range(self.m_policy, instance.pool_rank(), + instance.pool_size()); typename Analysis::Reducer final_reducer(self.m_functor); reference_type update = - 
final_reducer.init(static_cast<pointer_type>(exec.reduce_memory())); + final_reducer.init(static_cast<pointer_type>(instance.reduce_memory())); ParallelScanWithTotal::template exec_range<WorkTag>( self.m_functor, range.begin(), range.end(), update, false); - // exec.template scan_large(final_reducer); - exec.scan_small(final_reducer); + instance.scan_small(final_reducer); ParallelScanWithTotal::template exec_range<WorkTag>( self.m_functor, range.begin(), range.end(), update, true); - exec.fan_in(); + instance.fan_in(); - if (exec.pool_rank() == exec.pool_size() - 1) { + if (instance.pool_rank() == instance.pool_size() - 1) { *self.m_result_ptr = update; } } public: inline void execute() const { - ThreadsExec::resize_scratch(2 * Analysis::value_size(m_functor), 0); - ThreadsExec::start(&ParallelScanWithTotal::exec, this); - ThreadsExec::fence(); + ThreadsInternal::resize_scratch(2 * Analysis::value_size(m_functor), 0); + ThreadsInternal::start(&ParallelScanWithTotal::exec, this); + ThreadsInternal::fence(); } template diff --git a/core/src/impl/Kokkos_Spinwait.cpp b/core/src/Threads/Kokkos_Threads_Spinwait.cpp similarity index 90% rename from core/src/impl/Kokkos_Spinwait.cpp rename to core/src/Threads/Kokkos_Threads_Spinwait.cpp index 0a7eda29bcf..3df9dc07bf4 100644 --- a/core/src/impl/Kokkos_Spinwait.cpp +++ b/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -21,7 +21,7 @@ #include #include -#include <impl/Kokkos_Spinwait.hpp> +#include <Threads/Kokkos_Threads_Spinwait.hpp> #include #include @@ -108,5 +108,15 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value) { + Kokkos::store_fence(); + uint32_t i = 0; + while (value == flag) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + } // namespace Impl } // namespace Kokkos diff --git a/algorithms/src/std_algorithms/Kokkos_Swap.hpp b/core/src/Threads/Kokkos_Threads_Spinwait.hpp similarity index 52% rename from algorithms/src/std_algorithms/Kokkos_Swap.hpp rename to core/src/Threads/Kokkos_Threads_Spinwait.hpp index acd2a572c8c..b98b6dbb73b 100644 --- a/algorithms/src/std_algorithms/Kokkos_Swap.hpp +++ b/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -14,28 +14,30 @@ // //@HEADER -#ifndef KOKKOS_STD_ALGORITHMS_SWAP_HPP -#define KOKKOS_STD_ALGORITHMS_SWAP_HPP +#ifndef KOKKOS_THREADS_SPINWAIT_HPP +#define KOKKOS_THREADS_SPINWAIT_HPP -#include <Kokkos_Macros.hpp> +#include <Threads/Kokkos_Threads_State.hpp> + +#include <cstdint> namespace Kokkos { -namespace Experimental { - -// swap -template <class T> -KOKKOS_INLINE_FUNCTION void swap(T& a, T& b) noexcept { - static_assert( - std::is_move_assignable<T>::value && std::is_move_constructible<T>::value, - "Kokkos::Experimental::swap arguments must be move assignable " - "and move constructible"); - - T tmp = std::move(a); - a = std::move(b); - b = std::move(tmp); -} - -} // namespace Experimental +namespace Impl { + +enum class WaitMode : int { + ACTIVE // Used for tight loops to keep threads active longest + , + PASSIVE // Used to quickly yield the thread to quiet down the system + , + ROOT // Never sleep or yield the root thread +}; + +void host_thread_yield(const uint32_t i, const WaitMode mode); + +void spinwait_while_equal(ThreadState const volatile& flag, + ThreadState const value); + +} // namespace Impl } // namespace Kokkos #endif diff --git a/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp b/core/src/Threads/Kokkos_Threads_State.hpp similarity index 59% rename from core/src/fwd/Kokkos_Fwd_HBWSpace.hpp rename to core/src/Threads/Kokkos_Threads_State.hpp index 21ba7fad01c..148e9aa4e05 100644 ---
a/core/src/fwd/Kokkos_Fwd_HBWSpace.hpp +++ b/core/src/Threads/Kokkos_Threads_State.hpp @@ -14,16 +14,26 @@ // //@HEADER -#ifndef KOKKOS_HBWSPACE_FWD_HPP_ -#define KOKKOS_HBWSPACE_FWD_HPP_ +#ifndef KOKKOS_THREADS_STATE_HPP +#define KOKKOS_THREADS_STATE_HPP -#ifdef KOKKOS_ENABLE_HBWSPACE namespace Kokkos { - -namespace Experimental { -class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL - /// processor) -} // namespace Experimental +namespace Impl { +/** \brief States of a worker thread */ +enum class ThreadState { + Terminating ///< Termination in progress + , + Inactive ///< Exists, waiting for work + , + Active ///< Exists, performing work + , + Rendezvous ///< Exists, waiting in a barrier or reduce + , + ScanCompleted, + ScanAvailable, + ReductionAvailable +}; +} // namespace Impl } // namespace Kokkos -#endif + #endif diff --git a/core/src/Threads/Kokkos_ThreadsTeam.hpp b/core/src/Threads/Kokkos_Threads_Team.hpp similarity index 95% rename from core/src/Threads/Kokkos_ThreadsTeam.hpp rename to core/src/Threads/Kokkos_Threads_Team.hpp index b1cadc7c485..fd0f221365b 100644 --- a/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/core/src/Threads/Kokkos_Threads_Team.hpp @@ -22,10 +22,11 @@ #include #include -#include #include #include +#include +#include //---------------------------------------------------------------------------- @@ -50,8 +51,8 @@ class ThreadsExecTeamMember { private: using space = execution_space::scratch_memory_space; - ThreadsExec* const m_exec; - ThreadsExec* const* m_team_base; ///< Base for team fan-in + ThreadsInternal* const m_instance; + ThreadsInternal* const* m_team_base; ///< Base for team fan-in space m_team_shared; size_t m_team_shared_size; int m_team_size; @@ -84,14 +85,13 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - Impl::spinwait_while_equal(m_team_base[j]->state(), - ThreadsExec::Active); + spinwait_while_equal(m_team_base[j]->state(), ThreadState::Active); } // If not root then wait for release if (m_team_rank_rev) { - m_exec->state() = ThreadsExec::Rendezvous; - Impl::spinwait_while_equal(m_exec->state(), ThreadsExec::Rendezvous); + m_instance->state() = ThreadState::Rendezvous; + spinwait_while_equal(m_instance->state(), ThreadState::Rendezvous); } return !m_team_rank_rev; @@ -102,7 +102,7 @@ class ThreadsExecTeamMember { for (n = 1; (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); n <<= 1) { - m_team_base[j]->state() = ThreadsExec::Active; + m_team_base[j]->state() = ThreadState::Active; } } @@ -188,10 +188,10 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return value; + if (m_instance == nullptr) return value; if (team_rank() != team_size() - 1) * - ((volatile type*)m_exec->scratch_memory()) = value; + ((volatile type*)m_instance->scratch_memory()) = value; memory_fence(); @@ -229,9 +229,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return; + if (m_instance == nullptr) return; - type* const local_value = ((type*)m_exec->scratch_memory()); + type* const local_value = ((type*)m_instance->scratch_memory()); // Set this thread's contribution if (team_rank() != team_size() - 1) { *local_value = contribution; } @@ -285,9 +285,9 @@ class ThreadsExecTeamMember { using type = typename if_c::type; - if (nullptr == m_exec) return type(0); + if (m_instance == nullptr) return type(0); - volatile type* const work_value = 
((type*)m_exec->scratch_memory()); + volatile type* const work_value = ((type*)m_instance->scratch_memory()); *work_value = value; @@ -342,10 +342,10 @@ class ThreadsExecTeamMember { template ThreadsExecTeamMember( - Impl::ThreadsExec* exec, + Impl::ThreadsInternal* instance, const TeamPolicyInternal& team, const size_t shared_size) - : m_exec(exec), + : m_instance(instance), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(shared_size), @@ -361,9 +361,11 @@ class ThreadsExecTeamMember { if (team.league_size()) { // Execution is using device-team interface: - const int pool_rank_rev = m_exec->pool_size() - (m_exec->pool_rank() + 1); + const int pool_rank_rev = + m_instance->pool_size() - (m_instance->pool_rank() + 1); const int team_rank_rev = pool_rank_rev % team.team_alloc(); - const size_t pool_league_size = m_exec->pool_size() / team.team_alloc(); + const size_t pool_league_size = + m_instance->pool_size() / team.team_alloc(); const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc(); if (pool_league_rank_rev >= pool_league_size) { m_invalid_thread = 1; @@ -372,7 +374,7 @@ class ThreadsExecTeamMember { const size_t pool_league_rank = pool_league_size - (pool_league_rank_rev + 1); - const int pool_num_teams = m_exec->pool_size() / team.team_alloc(); + const int pool_num_teams = m_instance->pool_size() / team.team_alloc(); const int chunk_size = team.chunk_size() > 0 ? team.chunk_size() : team.team_iter(); const int chunks_per_team = @@ -387,8 +389,8 @@ class ThreadsExecTeamMember { if ((team.team_alloc() > size_t(m_team_size)) ? (team_rank_rev >= m_team_size) - : (m_exec->pool_size() - pool_num_teams * m_team_size > - m_exec->pool_rank())) + : (m_instance->pool_size() - pool_num_teams * m_team_size > + m_instance->pool_rank())) m_invalid_thread = 1; else m_invalid_thread = 0; @@ -398,7 +400,7 @@ class ThreadsExecTeamMember { if (team_rank_rev < team.team_size() && !m_invalid_thread) { m_team_base = - m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev; + m_instance->pool_base() + team.team_alloc() * pool_league_rank_rev; m_team_size = team.team_size(); m_team_rank = team.team_size() - (team_rank_rev + 1); m_team_rank_rev = team_rank_rev; @@ -413,13 +415,13 @@ class ThreadsExecTeamMember { } if ((m_team_rank_rev == 0) && (m_invalid_thread == 0)) { - m_exec->set_work_range(m_league_rank, m_league_end, m_chunk_size); - m_exec->reset_steal_target(m_team_size); + m_instance->set_work_range(m_league_rank, m_league_end, m_chunk_size); + m_instance->reset_steal_target(m_team_size); } if (std::is_same::schedule_type::type, Kokkos::Dynamic>::value) { - m_exec->barrier(); + m_instance->barrier(); } } else { m_invalid_thread = 1; @@ -427,7 +429,7 @@ class ThreadsExecTeamMember { } ThreadsExecTeamMember() - : m_exec(nullptr), + : m_instance(nullptr), m_team_base(nullptr), m_team_shared(nullptr, 0), m_team_shared_size(0), @@ -442,8 +444,8 @@ class ThreadsExecTeamMember { m_invalid_thread(0), m_team_alloc(0) {} - inline ThreadsExec& threads_exec_team_base() const { - return m_team_base ? **m_team_base : *m_exec; + inline ThreadsInternal& threads_exec_team_base() const { + return m_team_base ? 
**m_team_base : *m_instance; } bool valid_static() const { return m_league_rank < m_league_end; } @@ -999,8 +1001,10 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, false); } + auto& team_member = loop_bounds.thread; + // 'scan_val' output is the exclusive prefix sum - scan_val = loop_bounds.thread.team_scan(scan_val); + scan_val = team_member.team_scan(scan_val); #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep @@ -1010,6 +1014,8 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( lambda(i, scan_val, true); } + team_member.team_broadcast(scan_val, team_member.team_size() - 1); + return_val = scan_val; } diff --git a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index d4ce697548f..c88d66db5f9 100644 --- a/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -18,7 +18,7 @@ #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP #include -#include +#include namespace Kokkos { namespace Impl { @@ -61,16 +61,17 @@ class ParallelFor, } } - static inline void thread_main(ThreadsExec& exec, const void* arg) noexcept { + static inline void thread_main(ThreadsInternal& instance, + const void* arg) noexcept { const Self& self = *(static_cast(arg)); self.exec_one_thread(); - exec.fan_in(); + instance.fan_in(); } public: inline void execute() { - ThreadsExec::start(&Self::thread_main, this); - ThreadsExec::fence(); + ThreadsInternal::start(&Self::thread_main, this); + ThreadsInternal::fence(); } inline ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) diff --git a/core/src/decl/Kokkos_Declare_HBWSpace.hpp b/core/src/decl/Kokkos_Declare_HBWSpace.hpp deleted file mode 100644 index 1328c931352..00000000000 --- a/core/src/decl/Kokkos_Declare_HBWSpace.hpp +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
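The parallel_scan change in Kokkos_Threads_Team.hpp above is a semantic fix, not just a rename: team_scan computes an exclusive prefix sum, so after the scan only the last team rank holds the grand total, yet every rank returns return_val. Broadcasting from rank team_size() - 1 makes all ranks agree. A minimal stand-alone sketch of that contract (plain C++ over a vector standing in for the team; names are illustrative, not Kokkos API):

#include <vector>

// Exclusive scan: slot i ends up with the sum of slots 0..i-1. The grand
// total is only known after the last slot -- the analogue of the last team
// rank -- so it is returned separately, mirroring the broadcast above.
int exclusive_scan_with_total(std::vector<int>& vals) {
  int running = 0;
  for (int& v : vals) {
    const int mine = v;
    v = running;      // exclusive prefix for this "rank"
    running += mine;  // accumulates toward the grand total
  }
  return running;  // every "rank" should observe this same value
}

Without the broadcast, ranks other than the last would return whatever happened to be in their local scan_val, which is exactly the inconsistency the added team_broadcast call removes.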
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_DECLARE_HBWSPACE_HPP -#define KOKKOS_DECLARE_HBWSPACE_HPP - -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - -#endif diff --git a/core/src/decl/Kokkos_Declare_HIP.hpp b/core/src/decl/Kokkos_Declare_HIP.hpp index e115f7051f3..cf405e57b8f 100644 --- a/core/src/decl/Kokkos_Declare_HIP.hpp +++ b/core/src/decl/Kokkos_Declare_HIP.hpp @@ -25,9 +25,13 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include #include #include #include diff --git a/core/src/decl/Kokkos_Declare_THREADS.hpp b/core/src/decl/Kokkos_Declare_THREADS.hpp index f5cbc0c1d1d..4d7caec6f5f 100644 --- a/core/src/decl/Kokkos_Declare_THREADS.hpp +++ b/core/src/decl/Kokkos_Declare_THREADS.hpp @@ -19,7 +19,7 @@ #if defined(KOKKOS_ENABLE_THREADS) #include -#include +#include #include #include #include @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #endif diff --git a/core/src/impl/Kokkos_Core.cpp b/core/src/impl/Kokkos_Core.cpp index 5c182db5663..4a696526161 100644 --- a/core/src/impl/Kokkos_Core.cpp +++ b/core/src/impl/Kokkos_Core.cpp @@ -90,8 +90,6 @@ void combine(Kokkos::InitializationSettings& out, KOKKOS_IMPL_COMBINE_SETTING(num_threads); KOKKOS_IMPL_COMBINE_SETTING(map_device_id_by); KOKKOS_IMPL_COMBINE_SETTING(device_id); - KOKKOS_IMPL_COMBINE_SETTING(num_devices); - KOKKOS_IMPL_COMBINE_SETTING(skip_device); KOKKOS_IMPL_COMBINE_SETTING(disable_warnings); KOKKOS_IMPL_COMBINE_SETTING(tune_internals); KOKKOS_IMPL_COMBINE_SETTING(tools_help); @@ -131,11 +129,15 @@ void combine(Kokkos::Tools::InitArguments& out, int get_device_count() { #if defined(KOKKOS_ENABLE_CUDA) - return Kokkos::Cuda::detect_device_count(); + int count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_HIP) - return Kokkos::HIP::detect_device_count(); + int count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); + return count; #elif defined(KOKKOS_ENABLE_SYCL) - return sycl::device::get_devices(sycl::info::device_type::gpu).size(); + return Kokkos::Experimental::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -165,20 +167,43 @@ bool is_valid_map_device_id_by(std::string const& x) { } // namespace +std::vector const& Kokkos::Impl::get_visible_devices() { + static auto devices = get_visible_devices(get_device_count()); + return devices; +} + [[nodiscard]] int Kokkos::device_id() noexcept { #if defined(KOKKOS_ENABLE_CUDA) - return Cuda().cuda_device(); + int device = Cuda().cuda_device(); #elif defined(KOKKOS_ENABLE_HIP) - return HIP().hip_device(); + int device = HIP().hip_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - return Experimental::OpenACC().acc_device_number(); + int device = Experimental::OpenACC().acc_device_number(); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_default_device(); // FIXME_OPENMPTARGET + int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - return Experimental::Impl::SYCLInternal::m_syclDev; + int device = Experimental::Impl::SYCLInternal::m_syclDev; #else - return -1; + int device = -1; + return device; #endif + auto const& visible_devices = Impl::get_visible_devices(); + for (std::size_t i = 0; i < visible_devices.size(); ++i) { + if (visible_devices[i] == device) { + return i; + } + } + Kokkos::abort("Unexpected 
error: cannot determine device id"); + return -1; +} + +[[nodiscard]] int Kokkos::num_devices() noexcept { + if constexpr (std::is_same_v<DefaultExecutionSpace, DefaultHostExecutionSpace>) { + return -1; // no GPU backend enabled + } else { + return Impl::get_visible_devices().size(); + } } [[nodiscard]] int Kokkos::num_threads() noexcept { @@ -313,8 +338,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { return std::stoi(id.c_str()); } -std::vector<int> Kokkos::Impl::get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count) { +std::vector<int> Kokkos::Impl::get_visible_devices(int device_count) { std::vector<int> visible_devices; char* env_visible_devices = std::getenv("KOKKOS_VISIBLE_DEVICES"); if (env_visible_devices) { @@ -341,30 +365,9 @@ std::vector<int> Kokkos::Impl::get_visible_devices( } } } else { - int num_devices = - settings.has_num_devices() ? settings.get_num_devices() : device_count; - if (num_devices > device_count) { - std::stringstream ss; - ss << "Error: Specified number of devices '" << num_devices - << "' exceeds the actual number of GPUs available for execution '" - << device_count << "'." - << " Raised by Kokkos::initialize().\n"; - Kokkos::abort(ss.str().c_str()); - } - for (int i = 0; i < num_devices; ++i) { + for (int i = 0; i < device_count; ++i) { visible_devices.push_back(i); } - if (settings.has_skip_device()) { - if (visible_devices.size() == 1 && settings.get_skip_device() == 0) { - Kokkos::abort( - "Error: skipping the only GPU available for execution.\n" - " Raised by Kokkos::initialize().\n"); - } - visible_devices.erase( - std::remove(visible_devices.begin(), visible_devices.end(), - settings.get_skip_device()), - visible_devices.end()); - } } if (visible_devices.empty()) { Kokkos::abort( @@ -374,10 +377,10 @@ std::vector<int> Kokkos::Impl::get_visible_devices( } return visible_devices; } -int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { - std::vector<int> visible_devices = - get_visible_devices(settings, get_device_count()); - int const num_devices = visible_devices.size(); +std::optional<int> Kokkos::Impl::get_gpu( + const InitializationSettings& settings) { + std::vector<int> visible_devices = get_visible_devices(get_device_count()); + int const num_devices = visible_devices.size(); // device_id is provided if (settings.has_device_id()) { int const id = settings.get_device_id(); @@ -423,14 +426,15 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { int const mpi_local_rank = mpi_local_rank_on_node(); - // use first GPU available for execution if unable to detect local MPI rank + // if unable to detect local MPI rank return nullopt to delegate device + // selection to the backend if (mpi_local_rank < 0) { if (settings.has_map_device_id_by()) { std::cerr << "Warning: unable to detect local MPI rank." << " Falling back to the first GPU available for execution." << " Raised by Kokkos::initialize()." << std::endl; } - return visible_devices[0]; + return std::nullopt; } // use device assigned by CTest when resource allocation is activated @@ -445,13 +449,6 @@ int Kokkos::Impl::get_gpu(const InitializationSettings& settings) { namespace { void initialize_backends(const Kokkos::InitializationSettings& settings) { -// This is an experimental setting -// For KNL in Flat mode this variable should be set, so that -// memkind allocates high bandwidth memory correctly.
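The Kokkos_Core.cpp rework above routes all device selection through one visibility list: KOKKOS_VISIBLE_DEVICES (a comma-separated list of ids) is parsed once, and device_id() then reports the position of the backend's raw handle inside that list. A self-contained sketch of the parsing step, assuming the environment variable carries plain integers (hypothetical helper, not the library function):

#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

std::vector<int> parse_visible_devices(int device_count) {
  std::vector<int> ids;
  if (const char* env = std::getenv("KOKKOS_VISIBLE_DEVICES")) {
    std::stringstream ss(env);
    for (std::string token; std::getline(ss, token, ',');) {
      ids.push_back(std::stoi(token));  // user-chosen subset, user order
    }
  } else {
    for (int i = 0; i < device_count; ++i) ids.push_back(i);  // everything
  }
  return ids;
}

With that list in hand, the linear search in device_id() above is all that is needed to translate a raw backend handle into the logical id users see.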
-#ifdef KOKKOS_ENABLE_HBWSPACE - setenv("MEMKIND_HBW_NODES", "1", 0); -#endif - Kokkos::Impl::ExecSpaceManager::get_instance().initialize_spaces(settings); } @@ -571,19 +568,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "no"); #endif -#ifdef KOKKOS_ENABLE_HBWSPACE - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_HBWSPACE", "no"); -#endif -#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "yes"); -#else - declare_configuration_metadata("memory", "KOKKOS_ENABLE_INTEL_MM_ALLOC", - "no"); -#endif - #ifdef KOKKOS_ENABLE_ASM declare_configuration_metadata("options", "KOKKOS_ENABLE_ASM", "yes"); #else @@ -604,6 +588,11 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX23", "no"); #endif +#ifdef KOKKOS_ENABLE_CXX26 + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "yes"); +#else + declare_configuration_metadata("options", "KOKKOS_ENABLE_CXX26", "no"); +#endif #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK declare_configuration_metadata("options", "KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK", "yes"); @@ -616,11 +605,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #else declare_configuration_metadata("options", "KOKKOS_ENABLE_HWLOC", "no"); #endif -#ifdef KOKKOS_ENABLE_LIBRT - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "yes"); -#else - declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBRT", "no"); -#endif #ifdef KOKKOS_ENABLE_LIBDL declare_configuration_metadata("options", "KOKKOS_ENABLE_LIBDL", "yes"); #else @@ -645,8 +629,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { "ARMV8_THUNDERX2"); #elif defined(KOKKOS_ARCH_BDW) declare_configuration_metadata("architecture", "CPU architecture", "BDW"); -#elif defined(KOKKOS_ARCH_BGQ) - declare_configuration_metadata("architecture", "CPU architecture", "BGQ"); #elif defined(KOKKOS_ARCH_HSW) declare_configuration_metadata("architecture", "CPU architecture", "HSW"); #elif defined(KOKKOS_ARCH_ICL) @@ -659,8 +641,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "KNL"); #elif defined(KOKKOS_ARCH_NATIVE) declare_configuration_metadata("architecture", "CPU architecture", "NATIVE"); -#elif defined(KOKKOS_ARCH_POWER7) - declare_configuration_metadata("architecture", "CPU architecture", "POWER7"); #elif defined(KOKKOS_ARCH_POWER8) declare_configuration_metadata("architecture", "CPU architecture", "POWER8"); #elif defined(KOKKOS_ARCH_POWER9) @@ -673,8 +653,6 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "CPU architecture", "SNB"); #elif defined(KOKKOS_ARCH_SPR) declare_configuration_metadata("architecture", "CPU architecture", "SPR"); -#elif defined(KOKKOS_ARCH_WSM) - declare_configuration_metadata("architecture", "CPU architecture", "WSM"); #elif defined(KOKKOS_ARCH_AMD_ZEN) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN"); #elif defined(KOKKOS_ARCH_AMD_ZEN2) @@ -683,6 +661,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMD_ZEN3) declare_configuration_metadata("architecture", "CPU architecture", "AMD_ZEN3"); 
+#elif defined(KOKKOS_ARCH_RISCV_SG2042) + declare_configuration_metadata("architecture", "CPU architecture", + "SG2042 (RISC-V)"); #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -752,8 +733,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_ADA89) declare_configuration_metadata("architecture", "GPU architecture", "ADA89"); #elif defined(KOKKOS_ARCH_HOPPER90) - declare_configuration_metadata("architecture", "GPU architecture", - "HOPPER90"); + declare_configuration_metadata("architecture", "GPU architecture", + "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX906"); @@ -911,36 +892,18 @@ void Kokkos::Impl::parse_command_line_arguments( int num_threads; int device_id; - int num_devices; // deprecated - int skip_device; // deprecated std::string map_device_id_by; bool disable_warnings; bool print_configuration; bool tune_internals; - auto get_flag = [](std::string s) -> std::string { - return s.erase(s.find('=')); - }; - bool help_flag = false; int iarg = 0; while (iarg < argc) { bool remove_flag = false; - if (check_arg(argv[iarg], "--kokkos-numa") || - check_arg(argv[iarg], "--numa")) { - warn_deprecated_command_line_argument(get_flag(argv[iarg])); - // remove flag if prefixed with '--kokkos-' - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads) || - check_arg_int(argv[iarg], "--num-threads", num_threads) || - check_arg_int(argv[iarg], "--kokkos-threads", num_threads) || - check_arg_int(argv[iarg], "--threads", num_threads)) { - if (get_flag(argv[iarg]) != "--kokkos-num-threads") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-num-threads"); - } + if (check_arg_int(argv[iarg], "--kokkos-num-threads", num_threads)) { if (!is_valid_num_threads(num_threads)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid." @@ -949,15 +912,8 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_num_threads(num_threads); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id) || - check_arg_int(argv[iarg], "--device-id", device_id) || - check_arg_int(argv[iarg], "--kokkos-device", device_id) || - check_arg_int(argv[iarg], "--device", device_id)) { - if (get_flag(argv[iarg]) != "--kokkos-device-id") { - warn_deprecated_command_line_argument(get_flag(argv[iarg]), - "--kokkos-device-id"); - } + remove_flag = true; + } else if (check_arg_int(argv[iarg], "--kokkos-device-id", device_id)) { if (!is_valid_device_id(device_id)) { std::stringstream ss; ss << "Error: command line argument '" << argv[iarg] << "' is invalid."
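Only the canonical --kokkos-* spellings survive the cleanup above; every recognized flag now sets remove_flag unconditionally so it is stripped from argv before the application parses its own options. For illustration, a parser matching the apparent contract of check_arg_int (the real helper is internal to Kokkos, so this is an assumed reimplementation):

#include <cstring>
#include <string>

// Accepts exactly "<flag>=<integer>"; on success stores the value and
// returns true, otherwise leaves 'out' untouched and returns false.
bool check_arg_int_sketch(const char* arg, const char* flag, int& out) {
  const std::size_t n = std::strlen(flag);
  if (std::strncmp(arg, flag, n) != 0 || arg[n] != '=') return false;
  try {
    out = std::stoi(std::string(arg + n + 1));
  } catch (...) {
    return false;  // not an integer; reject rather than guess
  }
  return true;
}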
@@ -966,70 +922,7 @@ void Kokkos::Impl::parse_command_line_arguments( Kokkos::abort(ss.str().c_str()); } settings.set_device_id(device_id); - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; - } else if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices") || - check_arg(argv[iarg], "--ndevices")) { - if (check_arg(argv[iarg], "--num-devices")) { - warn_deprecated_command_line_argument("--num-devices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--ndevices")) { - warn_deprecated_command_line_argument("--ndevices", - "--kokkos-num-devices"); - } - if (check_arg(argv[iarg], "--kokkos-ndevices")) { - warn_deprecated_command_line_argument("--kokkos-ndevices", - "--kokkos-num-devices"); - } - warn_deprecated_command_line_argument( - "--kokkos-num-devices", "--kokkos-map-device-id-by=mpi_rank"); - // Find the number of device (expecting --device=XX) - if (!((strncmp(argv[iarg], "--kokkos-num-devices=", 21) == 0) || - (strncmp(argv[iarg], "--num-devices=", 14) == 0) || - (strncmp(argv[iarg], "--kokkos-ndevices=", 18) == 0) || - (strncmp(argv[iarg], "--ndevices=", 11) == 0))) - throw_runtime_exception( - "Error: expecting an '=INT[,INT]' after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - - char* num1 = strchr(argv[iarg], '=') + 1; - char* num2 = strpbrk(num1, ","); - int num1_len = num2 == nullptr ? strlen(num1) : num2 - num1; - char* num1_only = new char[num1_len + 1]; - strncpy(num1_only, num1, num1_len); - num1_only[num1_len] = '\0'; - - if (!is_unsigned_int(num1_only) || (strlen(num1_only) == 0)) { - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices'." - " Raised by Kokkos::initialize()."); - } - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - num_devices = std::stoi(num1_only); - settings.set_num_devices(num_devices); - settings.set_map_device_id_by("mpi_rank"); - } - delete[] num1_only; - - if (num2 != nullptr) { - if ((!is_unsigned_int(num2 + 1)) || (strlen(num2) == 1)) - throw_runtime_exception( - "Error: expecting an integer number after command line argument " - "'--kokkos-num-devices=XX,'." - " Raised by Kokkos::initialize()."); - - if (check_arg(argv[iarg], "--kokkos-num-devices") || - check_arg(argv[iarg], "--kokkos-ndevices")) { - skip_device = std::stoi(num2 + 1); - settings.set_skip_device(skip_device); - } - } - remove_flag = std::string(argv[iarg]).find("--kokkos-") == 0; + remove_flag = true; } else if (check_arg_bool(argv[iarg], "--kokkos-disable-warnings", disable_warnings)) { settings.set_disable_warnings(disable_warnings); @@ -1098,9 +991,6 @@ void Kokkos::Impl::parse_environment_variables( } combine(settings, tools_init_arguments); - if (std::getenv("KOKKOS_NUMA")) { - warn_deprecated_environment_variable("KOKKOS_NUMA"); - } int num_threads; if (check_env_int("KOKKOS_NUM_THREADS", num_threads)) { if (!is_valid_num_threads(num_threads)) { @@ -1125,34 +1015,6 @@ void Kokkos::Impl::parse_environment_variables( } settings.set_device_id(device_id); } - int num_devices; - int rand_devices; - bool has_num_devices = check_env_int("KOKKOS_NUM_DEVICES", num_devices); - bool has_rand_devices = check_env_int("KOKKOS_RAND_DEVICES", rand_devices); - if (has_rand_devices && has_num_devices) { - Impl::throw_runtime_exception( - "Error: cannot specify both KOKKOS_NUM_DEVICES and " - "KOKKOS_RAND_DEVICES." 
- " Raised by Kokkos::initialize()."); - } - if (has_num_devices) { - warn_deprecated_environment_variable("KOKKOS_NUM_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=mpi_rank"); - settings.set_map_device_id_by("mpi_rank"); - settings.set_num_devices(num_devices); - } - if (has_rand_devices) { - warn_deprecated_environment_variable("KOKKOS_RAND_DEVICES", - "KOKKOS_MAP_DEVICE_ID_BY=random"); - settings.set_map_device_id_by("random"); - settings.set_num_devices(rand_devices); - } - if (has_num_devices || has_rand_devices) { - int skip_device; - if (check_env_int("KOKKOS_SKIP_DEVICE", skip_device)) { - settings.set_skip_device(skip_device); - } - } bool disable_warnings; if (check_env_bool("KOKKOS_DISABLE_WARNINGS", disable_warnings)) { settings.set_disable_warnings(disable_warnings); diff --git a/core/src/impl/Kokkos_DeviceManagement.hpp b/core/src/impl/Kokkos_DeviceManagement.hpp index bd89c8b19ca..70dca5d8fad 100644 --- a/core/src/impl/Kokkos_DeviceManagement.hpp +++ b/core/src/impl/Kokkos_DeviceManagement.hpp @@ -17,17 +17,17 @@ #ifndef KOKKOS_DEVICE_MANAGEMENT_HPP #define KOKKOS_DEVICE_MANAGEMENT_HPP +#include #include namespace Kokkos { class InitializationSettings; namespace Impl { -int get_gpu(const Kokkos::InitializationSettings& settings); +std::optional get_gpu(const Kokkos::InitializationSettings& settings); // This declaration is provided for testing purposes only int get_ctest_gpu(int local_rank); -// ditto -std::vector get_visible_devices( - Kokkos::InitializationSettings const& settings, int device_count); +std::vector get_visible_devices(int device_count); // test-only +std::vector const& get_visible_devices(); // use this instead } // namespace Impl } // namespace Kokkos diff --git a/core/src/impl/Kokkos_Error.cpp b/core/src/impl/Kokkos_Error.cpp index 4babe2d72bd..de6e83ed1f2 100644 --- a/core/src/impl/Kokkos_Error.cpp +++ b/core/src/impl/Kokkos_Error.cpp @@ -21,10 +21,11 @@ #include #include -#include +#include #include #include #include +#include // show_warnings #include #include @@ -38,6 +39,12 @@ void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } +void log_warning(const std::string &msg) { + if (show_warnings()) { + std::cerr << msg << std::flush; + } +} + std::string human_memory_size(size_t arg_bytes) { double bytes = arg_bytes; const double K = 1024; @@ -64,7 +71,8 @@ std::string human_memory_size(size_t arg_bytes) { void Experimental::RawMemoryAllocationFailure::print_error_message( std::ostream &o) const { - o << "Allocation of size " << Impl::human_memory_size(m_attempted_size); + o << "Allocation of size " + << ::Kokkos::Impl::human_memory_size(m_attempted_size); o << " failed"; switch (m_failure_mode) { case FailureMode::OutOfMemoryError: diff --git a/core/src/impl/Kokkos_Error.hpp b/core/src/impl/Kokkos_Error.hpp index 3d0b1d3274c..1058fd98dbf 100644 --- a/core/src/impl/Kokkos_Error.hpp +++ b/core/src/impl/Kokkos_Error.hpp @@ -28,6 +28,8 @@ namespace Impl { [[noreturn]] void throw_runtime_exception(const std::string &msg); +void log_warning(const std::string &msg); + std::string human_memory_size(size_t arg_bytes); } // namespace Impl @@ -58,7 +60,8 @@ class RawMemoryAllocationFailure : public std::bad_alloc { HIPMallocManaged, SYCLMallocDevice, SYCLMallocShared, - SYCLMallocHost + SYCLMallocHost, + OpenACCMalloc, }; private: diff --git a/core/src/impl/Kokkos_HBWSpace.cpp b/core/src/impl/Kokkos_HBWSpace.cpp deleted file mode 100644 index cd640b88cb9..00000000000 --- a/core/src/impl/Kokkos_HBWSpace.cpp +++ /dev/null @@ -1,313 +0,0 @@ 
-//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#ifdef KOKKOS_ENABLE_HBWSPACE -#include -#endif - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -#ifdef KOKKOS_ENABLE_HBWSPACE -#define MEMKIND_TYPE MEMKIND_HBW // hbw_get_kind(HBW_PAGESIZE_4KB) - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Experimental { - -/* Default allocation mechanism */ -HBWSpace::HBWSpace() : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); -} - -/* Default allocation mechanism */ -HBWSpace::HBWSpace(const HBWSpace::AllocationMechanism &arg_alloc_mech) - : m_alloc_mech(HBWSpace::STD_MALLOC) { - printf("Init2\n"); - setenv("MEMKIND_HBW_NODES", "1", 0); - if (arg_alloc_mech == STD_MALLOC) { - m_alloc_mech = HBWSpace::STD_MALLOC; - } -} - -void *HBWSpace::allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); -} -void *HBWSpace::allocate(const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); -} -void *HBWSpace::impl_allocate( - const char *arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - static_assert(sizeof(void *) == sizeof(uintptr_t), - "Error sizeof(void*) != sizeof(uintptr_t)"); - - static_assert( - Kokkos::Impl::power_of_two::value, - "Memory alignment must be power of two"); - - constexpr uintptr_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT; - constexpr uintptr_t alignment_mask = alignment - 1; - - void *ptr = nullptr; - - if (arg_alloc_size) { - if (m_alloc_mech == STD_MALLOC) { - // Over-allocate to and round up to guarantee proper alignment. - size_t size_padded = arg_alloc_size + sizeof(void *) + alignment; - - void *alloc_ptr = memkind_malloc(MEMKIND_TYPE, size_padded); - - if (alloc_ptr) { - uintptr_t address = reinterpret_cast(alloc_ptr); - - // offset enough to record the alloc_ptr - address += sizeof(void *); - uintptr_t rem = address % alignment; - uintptr_t offset = rem ? 
(alignment - rem) : 0u; - address += offset; - ptr = reinterpret_cast(address); - // record the alloc'd pointer - address -= sizeof(void *); - *reinterpret_cast(address) = alloc_ptr; - } - } - } - - if ((ptr == nullptr) || (reinterpret_cast(ptr) == ~uintptr_t(0)) || - (reinterpret_cast(ptr) & alignment_mask)) { - std::ostringstream msg; - msg << "Kokkos::Experimental::HBWSpace::allocate[ "; - switch (m_alloc_mech) { - case STD_MALLOC: msg << "STD_MALLOC"; break; - case POSIX_MEMALIGN: msg << "POSIX_MEMALIGN"; break; - case POSIX_MMAP: msg << "POSIX_MMAP"; break; - case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC"; break; - } - msg << " ]( " << arg_alloc_size << " ) FAILED"; - if (ptr == nullptr) { - msg << " nullptr"; - } else { - msg << " NOT ALIGNED " << ptr; - } - - std::cerr << msg.str() << std::endl; - std::cerr.flush(); - - Kokkos::Impl::throw_runtime_exception(msg.str()); - } - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); - } - - return ptr; -} - -void HBWSpace::deallocate(void *const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); -} -void HBWSpace::deallocate(const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size) const { - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); -} -void HBWSpace::impl_deallocate( - const char *arg_label, void *const arg_alloc_ptr, - const size_t arg_alloc_size, const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - if (arg_alloc_ptr) { - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, - reported_size); - } - - if (m_alloc_mech == STD_MALLOC) { - void *alloc_ptr = *(reinterpret_cast(arg_alloc_ptr) - 1); - memkind_free(MEMKIND_TYPE, alloc_ptr); - } - } -} - -} // namespace Experimental -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationRecord:: - SharedAllocationRecord( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : SharedAllocationRecord( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - // Fill in the Header information - RecordBase::m_alloc_ptr->m_record = - static_cast *>(this); - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length - 1); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - -//---------------------------------------------------------------------------- - -void * -SharedAllocationRecord::allocate_tracked( - const Kokkos::Experimental::HBWSpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHost = - SharedAllocationRecord; - - SharedAllocationHeader const *const head = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - RecordHost *const record = - head ? 
static_cast(head->m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace " - ", void >::get_record ERROR")); - } - - return record; -} - -// Iterate records to print orphaned memory ... -void SharedAllocationRecord:: - print_records(std::ostream &s, const Kokkos::Experimental::HBWSpace &space, - bool detail) { -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "HBWSpace", &s_root_record, detail); -#else - throw_runtime_exception( - "SharedAllocationRecord::print_records" - " only works with KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index 4a22898d168..bcce013b00e 100644 --- a/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ b/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -196,12 +196,12 @@ KOKKOS_INLINE_FUNCTION template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&); + T x, const Kokkos::Impl::half_impl_t::type&); #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&); + T x, const Kokkos::Impl::bhalf_impl_t::type&); #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED template @@ -283,13 +283,6 @@ class alignas(FloatType) floating_point_wrapper { private: impl_type val; - using fixed_width_integer_type = std::conditional_t< - sizeof(impl_type) == 2, uint16_t, - std::conditional_t< - sizeof(impl_type) == 4, uint32_t, - std::conditional_t>>; - static_assert(!std::is_void::value, - "Invalid impl_type"); public: // In-class initialization and defaulted default constructors not used @@ -318,18 +311,6 @@ class alignas(FloatType) floating_point_wrapper { default; #endif - KOKKOS_INLINE_FUNCTION - floating_point_wrapper(const volatile floating_point_wrapper& rhs) { -#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && !defined(KOKKOS_ENABLE_SYCL) - val = rhs.val; -#else - const volatile fixed_width_integer_type* rv_ptr = - reinterpret_cast(&rhs.val); - const fixed_width_integer_type rv_val = *rv_ptr; - val = reinterpret_cast(rv_val); -#endif // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH - } - KOKKOS_FUNCTION floating_point_wrapper(bit_comparison_type rhs) { val = Kokkos::bit_cast(rhs); @@ -492,15 +473,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - template - KOKKOS_FUNCTION void operator=(T rhs) volatile { - impl_type new_val = cast_to_wrapper(rhs, val).val; - volatile fixed_width_integer_type* val_ptr = - reinterpret_cast( - const_cast(&val)); - *val_ptr = reinterpret_cast(new_val); - } - // Compound operators KOKKOS_FUNCTION floating_point_wrapper& operator+=(floating_point_wrapper rhs) { @@ -515,15 +487,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator+=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs += tmp_rhs; - *this = tmp_lhs; - } - // Compound operators: upcast overloads for += template KOKKOS_FUNCTION friend std::enable_if_t< @@ -560,15 +523,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator-=(const volatile 
floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs -= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for -= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -605,15 +559,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator*=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs *= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for *= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -650,15 +595,6 @@ class alignas(FloatType) floating_point_wrapper { return *this; } - KOKKOS_FUNCTION - void operator/=(const volatile floating_point_wrapper& rhs) volatile { - floating_point_wrapper tmp_rhs = rhs; - floating_point_wrapper tmp_lhs = *this; - - tmp_lhs /= tmp_rhs; - *this = tmp_lhs; - } - // Compund operators: upcast overloads for /= template KOKKOS_FUNCTION friend std::enable_if_t< @@ -884,27 +820,6 @@ class alignas(FloatType) floating_point_wrapper { #endif } - KOKKOS_FUNCTION - friend bool operator==(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs == tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator!=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs != tmp_rhs; - } - - KOKKOS_FUNCTION - friend bool operator<(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs < tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -923,13 +838,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs < static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs > tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -948,13 +856,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs > static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator<=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs <= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -973,13 +874,6 @@ class alignas(FloatType) floating_point_wrapper { return lhs <= static_cast(rhs); } - KOKKOS_FUNCTION - friend bool operator>=(const volatile floating_point_wrapper& lhs, - const volatile floating_point_wrapper& rhs) { - floating_point_wrapper tmp_lhs = lhs, tmp_rhs = rhs; - return tmp_lhs >= tmp_rhs; - } - template KOKKOS_FUNCTION friend std::enable_if_t && (std::is_same_v || @@ -1018,14 +912,14 @@ class alignas(FloatType) floating_point_wrapper { // Declare wrapper overloads now that floating_point_wrapper is declared template static KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::half_impl_t::type&) { + T x, const Kokkos::Impl::half_impl_t::type&) { return Kokkos::Experimental::cast_to_half(x); } #ifdef KOKKOS_IMPL_BHALF_TYPE_DEFINED template static 
KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t cast_to_wrapper( - T x, const volatile Kokkos::Impl::bhalf_impl_t::type&) { + T x, const Kokkos::Impl::bhalf_impl_t::type&) { return Kokkos::Experimental::cast_to_bhalf(x); } #endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED diff --git a/core/src/impl/Kokkos_HostSpace.cpp b/core/src/impl/Kokkos_HostSpace.cpp index a9d72160593..1047b773d77 100644 --- a/core/src/impl/Kokkos_HostSpace.cpp +++ b/core/src/impl/Kokkos_HostSpace.cpp @@ -20,23 +20,11 @@ #include +#include +#include #include -#include #include -/*--------------------------------------------------------------------------*/ - -#if (defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_INTEL_LLVM)) && \ - !defined(KOKKOS_ENABLE_CUDA) - -// Intel specialized allocator does not interoperate with CUDA memory allocation - -#define KOKKOS_ENABLE_INTEL_MM_ALLOC - -#endif - -/*--------------------------------------------------------------------------*/ - #include #include #include @@ -50,10 +38,6 @@ #include #endif -#include -#include -#include - //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -150,84 +134,6 @@ void HostSpace::impl_deallocate( } // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -#ifdef KOKKOS_ENABLE_DEBUG -SharedAllocationRecord - SharedAllocationRecord::s_root_record; -#endif - -SharedAllocationRecord::~SharedAllocationRecord() { - m_space.deallocate(m_label.c_str(), - SharedAllocationRecord::m_alloc_ptr, - SharedAllocationRecord::m_alloc_size, - (SharedAllocationRecord::m_alloc_size - - sizeof(SharedAllocationHeader))); -} - -SharedAllocationHeader *_do_allocation(Kokkos::HostSpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast( - space.allocate(alloc_size)); - } catch (Experimental::RawMemoryAllocationFailure const &failure) { - if (failure.failure_mode() == Experimental::RawMemoryAllocationFailure:: - FailureMode::AllocationNotAligned) { - // TODO: delete the misaligned memory - } - - std::cerr << "Kokkos failed to allocate memory for label \"" << label - << "\". 
Allocation using MemorySpace named \"" << space.name() - << " failed with the following error: "; - failure.print_error_message(std::cerr); - std::cerr.flush(); - Kokkos::Impl::throw_runtime_exception("Memory allocation failure"); - } - return nullptr; // unreachable -} - -SharedAllocationRecord::SharedAllocationRecord( - const Kokkos::HostSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size, - const SharedAllocationRecord::function_type arg_dealloc) - // Pass through allocated [ SharedAllocationHeader , user_memory ] - // Pass through deallocation function - : base_t( -#ifdef KOKKOS_ENABLE_DEBUG - &SharedAllocationRecord::s_root_record, -#endif - Impl::checked_allocation_with_header(arg_space, arg_label, - arg_alloc_size), - sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc, - arg_label), - m_space(arg_space) { - this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, - arg_label); -} - -} // namespace Impl -} // namespace Kokkos - -//============================================================================== -// {{{1 - #include -namespace Kokkos { -namespace Impl { - -// To avoid additional compilation cost for something that's (mostly?) not -// performance sensitive, we explicity instantiate these CRTP base classes here, -// where we have access to the associated *_timpl.hpp header files. -template class SharedAllocationRecordCommon; - -} // end namespace Impl -} // end namespace Kokkos - -// end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::HostSpace); diff --git a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp index f740c408fb8..3072e2ce825 100644 --- a/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp +++ b/core/src/impl/Kokkos_HostSpace_ZeroMemset.hpp @@ -26,8 +26,7 @@ namespace Impl { template struct ZeroMemset> { - ZeroMemset(const HostSpace::execution_space& exec, const View& dst, - typename View::const_value_type&) { + ZeroMemset(const HostSpace::execution_space& exec, const View& dst) { // Host spaces, except for HPX, are synchronous and we need to fence for HPX // since we can't properly enqueue a std::memset otherwise. 
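// (This change drops the unused zero-value parameter and, just below,
// deletes the overload that took no execution space: zero-initialization
// now always routes through an execution space instance. A hypothetical
// call site would look roughly like
//   Kokkos::HostSpace::execution_space exec;
//   Kokkos::Impl::ZeroMemset<decltype(exec), decltype(v)>(exec, v);
// for a host View v, following the specialization pattern above.)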
// We can't use exec.fence() directly since we don't have a full definition @@ -36,12 +35,6 @@ struct ZeroMemset> { using ValueType = typename View::value_type; std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); } - - ZeroMemset(const View& dst, - typename View::const_value_type&) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } }; } // end namespace Impl diff --git a/core/src/impl/Kokkos_HostThreadTeam.cpp b/core/src/impl/Kokkos_HostThreadTeam.cpp index bfe5902bf7f..11bf701b57a 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -22,7 +22,6 @@ #include #include #include -#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/core/src/impl/Kokkos_HostThreadTeam.hpp b/core/src/impl/Kokkos_HostThreadTeam.hpp index 51f25a8b60f..25f09b82865 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -885,7 +885,7 @@ KOKKOS_INLINE_FUNCTION closure(i, accum, false); } - auto team_member = loop_boundaries.thread; + auto& team_member = loop_boundaries.thread; // 'accum' output is the exclusive prefix sum accum = team_member.team_scan(accum); diff --git a/core/src/impl/Kokkos_InitializationSettings.hpp b/core/src/impl/Kokkos_InitializationSettings.hpp index ab4350f3a7a..11a93c6bb56 100644 --- a/core/src/impl/Kokkos_InitializationSettings.hpp +++ b/core/src/impl/Kokkos_InitializationSettings.hpp @@ -24,32 +24,6 @@ namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -struct InitArguments { - int num_threads; - int num_numa; - int device_id; - int ndevices; - int skip_device; - bool disable_warnings; - bool tune_internals; - bool tool_help = false; - std::string tool_lib = {}; - std::string tool_args = {}; - - KOKKOS_DEPRECATED_WITH_COMMENT("Use InitializationSettings instead!") - InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, - bool ti = false) - : num_threads{nt}, - num_numa{nn}, - device_id{dv}, - ndevices{-1}, - skip_device{9999}, - disable_warnings{dw}, - tune_internals{ti} {} -}; -#endif - class InitializationSettings { #define KOKKOS_IMPL_DECLARE(TYPE, NAME) \ private: \ @@ -64,12 +38,32 @@ class InitializationSettings { TYPE get_##NAME() const noexcept { return *m_##NAME; } \ static_assert(true, "no-op to require trailing semicolon") +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + private: \ + std::optional m_##NAME; \ + \ + public: \ + KOKKOS_DEPRECATED InitializationSettings& set_##NAME(TYPE NAME) { \ + m_##NAME = NAME; \ + return *this; \ + } \ + KOKKOS_DEPRECATED bool has_##NAME() const noexcept { \ + return static_cast(m_##NAME); \ + } \ + KOKKOS_DEPRECATED TYPE get_##NAME() const noexcept { return *m_##NAME; } \ + static_assert(true, "no-op to require trailing semicolon") +#else +#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ + static_assert(true, "no-op to require trailing semicolon") +#endif + public: KOKKOS_IMPL_DECLARE(int, num_threads); KOKKOS_IMPL_DECLARE(int, device_id); KOKKOS_IMPL_DECLARE(std::string, map_device_id_by); - KOKKOS_IMPL_DECLARE(int, num_devices); // deprecated - KOKKOS_IMPL_DECLARE(int, skip_device); // deprecated + KOKKOS_IMPL_DECLARE_DEPRECATED(int, num_devices); + KOKKOS_IMPL_DECLARE_DEPRECATED(int, skip_device); KOKKOS_IMPL_DECLARE(bool, disable_warnings); KOKKOS_IMPL_DECLARE(bool, 
print_configuration); KOKKOS_IMPL_DECLARE(bool, tune_internals); @@ -80,41 +74,6 @@ class InitializationSettings { #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE #undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER #undef KOKKOS_IMPL_DECLARE - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 - public: - InitializationSettings() = default; - - InitializationSettings(InitArguments const& old) { - if (old.num_threads != -1) { - set_num_threads(old.num_threads); - } - if (old.device_id != -1) { - set_device_id(old.device_id); - } - if (old.ndevices != -1) { - set_num_devices(old.ndevices); - } - if (old.skip_device != 9999) { - set_skip_device(old.skip_device); - } - if (old.disable_warnings) { - set_disable_warnings(true); - } - if (old.tune_internals) { - set_tune_internals(true); - } - if (old.tool_help) { - set_tools_help(true); - } - if (!old.tool_lib.empty()) { - set_tools_libs(old.tool_lib); - } - if (!old.tool_args.empty()) { - set_tools_args(old.tool_args); - } - } -#endif }; } // namespace Kokkos diff --git a/core/src/impl/Kokkos_MemorySpace.cpp b/core/src/impl/Kokkos_MemorySpace.cpp deleted file mode 100644 index 2f0e01c5b28..00000000000 --- a/core/src/impl/Kokkos_MemorySpace.cpp +++ /dev/null @@ -1,72 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.cpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. - */ - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -void safe_throw_allocation_with_header_failure( - std::string const& space_name, std::string const& label, - Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { - auto generate_failure_message = [&](std::ostream& o) { - o << "Kokkos failed to allocate memory for label \"" << label - << "\". Allocation using MemorySpace named \"" << space_name - << "\" failed with the following error: "; - failure.print_error_message(o); - if (failure.failure_mode() == - Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: - AllocationNotAligned) { - // TODO: delete the misaligned memory? - o << "Warning: Allocation failed due to misalignment; memory may " - "be leaked.\n"; - } - o.flush(); - }; - try { - std::ostringstream sstr; - generate_failure_message(sstr); - Kokkos::Impl::throw_runtime_exception(sstr.str()); - } catch (std::bad_alloc const&) { - // Probably failed to allocate the string because we're so close to out - // of memory. Try printing to std::cerr instead - try { - generate_failure_message(std::cerr); - } catch (std::bad_alloc const&) { - // oh well, we tried... 
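// (Both attempts to format the detailed message can themselves throw
// std::bad_alloc when the process is this close to out of memory; control
// then falls through to the unconditional throw below, which reports that
// the error message itself could not be created. This helper is deleted
// here and re-added verbatim to core/src/impl/Kokkos_SharedAlloc.cpp later
// in this diff.)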
- } - Kokkos::Impl::throw_runtime_exception( - "Kokkos encountered an allocation failure, then another allocation " - "failure while trying to create the error message."); - } -} - -} // end namespace Impl -} // end namespace Kokkos diff --git a/core/src/impl/Kokkos_MemorySpace.hpp b/core/src/impl/Kokkos_MemorySpace.hpp deleted file mode 100644 index 44956dd7c5d..00000000000 --- a/core/src/impl/Kokkos_MemorySpace.hpp +++ /dev/null @@ -1,71 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -/** @file Kokkos_MemorySpace.hpp - * - * Operations common to memory space instances, or at least default - * implementations thereof. - */ - -#ifndef KOKKOS_IMPL_MEMORYSPACE_HPP -#define KOKKOS_IMPL_MEMORYSPACE_HPP - -#include -#include -#include - -#include - -namespace Kokkos { -namespace Impl { - -// Defined in implementation file to avoid having to include iostream -void safe_throw_allocation_with_header_failure( - std::string const &space_name, std::string const &label, - Kokkos::Experimental::RawMemoryAllocationFailure const &failure); - -template -SharedAllocationHeader *checked_allocation_with_header(MemorySpace const &space, - std::string const &label, - size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -template -SharedAllocationHeader *checked_allocation_with_header( - ExecutionSpace const &exec_space, MemorySpace const &space, - std::string const &label, size_t alloc_size) { - try { - return reinterpret_cast(space.allocate( - exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), - alloc_size)); - } catch (Kokkos::Experimental::RawMemoryAllocationFailure const &failure) { - safe_throw_allocation_with_header_failure(space.name(), label, failure); - } - return nullptr; // unreachable -} - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_MEMORYSPACE_HPP diff --git a/core/src/impl/Kokkos_Memory_Fence.hpp b/core/src/impl/Kokkos_Memory_Fence.hpp deleted file mode 100644 index 42a53b04fb2..00000000000 --- a/core/src/impl/Kokkos_Memory_Fence.hpp +++ /dev/null @@ -1,54 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#if defined(KOKKOS_ATOMIC_HPP) && !defined(KOKKOS_MEMORY_FENCE_HPP) -#define KOKKOS_MEMORY_FENCE_HPP -namespace Kokkos { - -////////////////////////////////////////////////////// -// store_fence() -// -// If possible use a store fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void store_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("sfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -////////////////////////////////////////////////////// -// load_fence() -// -// If possible use a load fence on the architecture, if not run a full memory -// fence - -KOKKOS_FORCEINLINE_FUNCTION -void load_fence() { -#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ENABLE_ISA_X86_64) - asm volatile("lfence" ::: "memory"); -#else - memory_fence(); -#endif -} - -} // namespace Kokkos - -#endif diff --git a/core/src/impl/Kokkos_Profiling_C_Interface.h b/core/src/impl/Kokkos_Profiling_C_Interface.h index 731a11e917a..15c466b27ed 100644 --- a/core/src/impl/Kokkos_Profiling_C_Interface.h +++ b/core/src/impl/Kokkos_Profiling_C_Interface.h @@ -154,7 +154,7 @@ enum Kokkos_Tools_OptimizationType { Kokkos_Tools_Maximize }; -struct Kokkos_Tools_OptimzationGoal { +struct Kokkos_Tools_OptimizationGoal { size_t type_id; enum Kokkos_Tools_OptimizationType goal; }; @@ -220,7 +220,7 @@ typedef void (*Kokkos_Tools_contextBeginFunction)(const size_t); typedef void (*Kokkos_Tools_contextEndFunction)( const size_t, struct Kokkos_Tools_VariableValue); typedef void (*Kokkos_Tools_optimizationGoalDeclarationFunction)( - const size_t, const struct Kokkos_Tools_OptimzationGoal goal); + const size_t, const struct Kokkos_Tools_OptimizationGoal goal); struct Kokkos_Profiling_EventSet { Kokkos_Profiling_initFunction init; diff --git a/core/src/impl/Kokkos_Profiling_Interface.hpp b/core/src/impl/Kokkos_Profiling_Interface.hpp index af71932e47b..b66886d9f7e 100644 --- a/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -226,7 +226,7 @@ using ValueType = Kokkos_Tools_VariableInfo_ValueType; using CandidateValueType = Kokkos_Tools_VariableInfo_CandidateValueType; using SetOrRange = Kokkos_Tools_VariableInfo_SetOrRange; using VariableInfo = Kokkos_Tools_VariableInfo; -using OptimizationGoal = Kokkos_Tools_OptimzationGoal; +using OptimizationGoal = Kokkos_Tools_OptimizationGoal; using TuningString = Kokkos_Tools_Tuning_String; using VariableValue = Kokkos_Tools_VariableValue; diff --git a/core/src/impl/Kokkos_SharedAlloc.cpp b/core/src/impl/Kokkos_SharedAlloc.cpp index 255f5125f4a..0bc3814b3a1 100644 --- a/core/src/impl/Kokkos_SharedAlloc.cpp +++ b/core/src/impl/Kokkos_SharedAlloc.cpp @@ -20,6 +20,8 @@ #include #include +#include +#include namespace Kokkos { namespace Impl { @@ -321,5 +323,53 @@ void SharedAllocationRecord::print_host_accessible_records( } #endif +void safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + auto generate_failure_message = [&](std::ostream& o) { + o << "Kokkos failed to allocate memory for label \"" << label + << "\". 
Allocation using MemorySpace named \"" << space_name + << "\" failed with the following error: "; + failure.print_error_message(o); + if (failure.failure_mode() == + Kokkos::Experimental::RawMemoryAllocationFailure::FailureMode:: + AllocationNotAligned) { + // TODO: delete the misaligned memory? + o << "Warning: Allocation failed due to misalignment; memory may " + "be leaked.\n"; + } + o.flush(); + }; + try { + std::ostringstream sstr; + generate_failure_message(sstr); + Kokkos::Impl::throw_runtime_exception(sstr.str()); + } catch (std::bad_alloc const&) { + // Probably failed to allocate the string because we're so close to out + // of memory. Try printing to std::cerr instead + try { + generate_failure_message(std::cerr); + } catch (std::bad_alloc const&) { + // oh well, we tried... + } + Kokkos::Impl::throw_runtime_exception( + "Kokkos encountered an allocation failure, then another allocation " + "failure while trying to create the error message."); + } +} + +void fill_host_accessible_header_info( + SharedAllocationRecord* arg_record, + SharedAllocationHeader& arg_header, std::string const& arg_label) { + // Fill in the Header information, directly accessible on the host + + arg_header.m_record = arg_record; + + strncpy(arg_header.m_label, arg_label.c_str(), + SharedAllocationHeader::maximum_label_length); + // Set last element zero, in case c_str is too long + arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; +} + } /* namespace Impl */ } /* namespace Kokkos */ diff --git a/core/src/impl/Kokkos_SharedAlloc.hpp b/core/src/impl/Kokkos_SharedAlloc.hpp index 043505a158e..99ab660213f 100644 --- a/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/core/src/impl/Kokkos_SharedAlloc.hpp @@ -51,6 +51,9 @@ class SharedAllocationHeader { friend class SharedAllocationRecordCommon; template friend class HostInaccessibleSharedAllocationRecordCommon; + friend void fill_host_accessible_header_info( + SharedAllocationRecord*, SharedAllocationHeader&, + std::string const&); Record* m_record; char m_label[maximum_label_length]; @@ -145,25 +148,23 @@ class SharedAllocationRecord { SharedAllocationRecord() : m_alloc_ptr(nullptr), m_alloc_size(0), - m_dealloc(nullptr) + m_dealloc(nullptr), #ifdef KOKKOS_ENABLE_DEBUG - , m_root(this), m_prev(this), - m_next(this) + m_next(this), #endif - , m_count(0) { } static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length; - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION const SharedAllocationHeader* head() const { return m_alloc_ptr; } /* User's memory begins at the end of the header */ - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION void* data() const { return static_cast(m_alloc_ptr + 1); } /* User's memory begins at the end of the header */ @@ -195,23 +196,79 @@ class SharedAllocationRecord { const SharedAllocationRecord* const root, const bool detail); }; +void safe_throw_allocation_with_header_failure( + std::string const& space_name, std::string const& label, + Kokkos::Experimental::RawMemoryAllocationFailure const& failure); + +template +SharedAllocationHeader* checked_allocation_with_header(MemorySpace const& space, + std::string const& label, + size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + label.c_str(), alloc_size + sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +template 
+SharedAllocationHeader* checked_allocation_with_header( + ExecutionSpace const& exec_space, MemorySpace const& space, + std::string const& label, size_t alloc_size) { + try { + return reinterpret_cast(space.allocate( + exec_space, label.c_str(), alloc_size + sizeof(SharedAllocationHeader), + alloc_size)); + } catch (Kokkos::Experimental::RawMemoryAllocationFailure const& failure) { + safe_throw_allocation_with_header_failure(space.name(), label, failure); + } + return nullptr; // unreachable +} + +void fill_host_accessible_header_info(SharedAllocationHeader& arg_header, + std::string const& arg_label); + template class SharedAllocationRecordCommon : public SharedAllocationRecord { private: using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; - derived_t& self() { return *static_cast(this); } - derived_t const& self() const { return *static_cast(this); } protected: using record_base_t::record_base_t; - void _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label); + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif static void deallocate(record_base_t* arg_rec); public: + ~SharedAllocationRecordCommon(); + template + SharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + auto& header = *SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); + } + SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + record_base_t::function_type dealloc = &deallocate); + static auto allocate(MemorySpace const& arg_space, std::string const& arg_label, size_t arg_alloc_size) -> derived_t*; @@ -231,22 +288,103 @@ class SharedAllocationRecordCommon : public SharedAllocationRecord { template class HostInaccessibleSharedAllocationRecordCommon - : public SharedAllocationRecordCommon { + : public SharedAllocationRecord { private: - using base_t = SharedAllocationRecordCommon; using derived_t = SharedAllocationRecord; using record_base_t = SharedAllocationRecord; protected: - using base_t::base_t; + using record_base_t::record_base_t; + + MemorySpace m_space; + +#ifdef KOKKOS_ENABLE_DEBUG + static record_base_t s_root_record; +#endif + + static void deallocate(record_base_t* arg_rec); public: + ~HostInaccessibleSharedAllocationRecordCommon(); + template + HostInaccessibleSharedAllocationRecordCommon( + ExecutionSpace const& exec, MemorySpace const& space, + std::string const& label, std::size_t alloc_size, + record_base_t::function_type dealloc = &deallocate) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(exec, space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + } + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t size, + 
record_base_t::function_type dealloc = &deallocate); + + static auto allocate(MemorySpace const& arg_space, + std::string const& arg_label, size_t arg_alloc_size) + -> derived_t*; + /**\brief Allocate tracked memory in the space */ + static void* allocate_tracked(MemorySpace const& arg_space, + std::string const& arg_alloc_label, + size_t arg_alloc_size); + /**\brief Reallocate tracked memory in the space */ + static void deallocate_tracked(void* arg_alloc_ptr); + /**\brief Deallocate tracked memory in the space */ + static void* reallocate_tracked(void* arg_alloc_ptr, size_t arg_alloc_size); + static void print_records(std::ostream& s, MemorySpace const&, bool detail = false); static auto get_record(void* alloc_ptr) -> derived_t*; std::string get_label() const; }; +#ifdef KOKKOS_ENABLE_DEBUG +template +SharedAllocationRecord + SharedAllocationRecordCommon::s_root_record; + +template +SharedAllocationRecord + HostInaccessibleSharedAllocationRecordCommon::s_root_record; +#endif + +#define KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::SharedAllocationRecordCommon { \ + using SharedAllocationRecordCommon< \ + MEMORY_SPACE>::SharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( \ + MEMORY_SPACE) \ + template <> \ + class Kokkos::Impl::SharedAllocationRecord \ + : public Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> { \ + using HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE>::HostInaccessibleSharedAllocationRecordCommon; \ + } + +#define KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::SharedAllocationRecordCommon + +#define KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( \ + MEMORY_SPACE) \ + template class Kokkos::Impl::HostInaccessibleSharedAllocationRecordCommon< \ + MEMORY_SPACE> + namespace { /* Taking the address of this function so make sure it is unique */ diff --git a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp index d403ef9db06..41036ab0678 100644 --- a/core/src/impl/Kokkos_SharedAlloc_timpl.hpp +++ b/core/src/impl/Kokkos_SharedAlloc_timpl.hpp @@ -31,6 +31,66 @@ namespace Kokkos { namespace Impl { +template +SharedAllocationRecordCommon::~SharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} +template +HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::~HostInaccessibleSharedAllocationRecordCommon() { + auto alloc_ptr = SharedAllocationRecord::m_alloc_ptr; + auto alloc_size = SharedAllocationRecord::m_alloc_size; + auto label = SharedAllocationRecord::m_label; + m_space.deallocate(label.c_str(), alloc_ptr, alloc_size, + alloc_size - sizeof(SharedAllocationHeader)); +} + +template +SharedAllocationRecordCommon::SharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + 
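+  // Layout produced by checked_allocation_with_header():
+  //
+  //   m_alloc_ptr --> [ SharedAllocationHeader | alloc_size bytes of user data ]
+  //
+  // data() is m_alloc_ptr + 1, and the destructor reports
+  // m_alloc_size - sizeof(SharedAllocationHeader) as the logical size.
+  // This space is host accessible, so the header is filled in place: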
auto& header = *SharedAllocationRecord::m_alloc_ptr; + fill_host_accessible_header_info(this, header, label); +} + +template +HostInaccessibleSharedAllocationRecordCommon:: + HostInaccessibleSharedAllocationRecordCommon( + MemorySpace const& space, std::string const& label, + std::size_t alloc_size, + SharedAllocationRecord::function_type dealloc) + : SharedAllocationRecord( +#ifdef KOKKOS_ENABLE_DEBUG + &s_root_record, +#endif + checked_allocation_with_header(space, label, alloc_size), + sizeof(SharedAllocationHeader) + alloc_size, dealloc, label), + m_space(space) { + SharedAllocationHeader header; + + fill_host_accessible_header_info(this, header, label); + + typename MemorySpace::execution_space exec; + Kokkos::Impl::DeepCopy( + exec, SharedAllocationRecord::m_alloc_ptr, &header, + sizeof(SharedAllocationHeader)); + exec.fence(std::string("SharedAllocationRecord::SharedAllocationRecord(): " + "fence after copying header from HostSpace"); +} + template auto SharedAllocationRecordCommon::allocate( MemorySpace const& arg_space, std::string const& arg_label, @@ -76,9 +136,64 @@ void* SharedAllocationRecordCommon::reallocate_tracked( Kokkos::Impl::DeepCopy( r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - Kokkos::fence( - "SharedAllocationRecord::reallocate_tracked(): fence after copying data"); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); + + record_base_t::increment(r_new); + record_base_t::decrement(r_old); + + return r_new->data(); +} + +template +auto HostInaccessibleSharedAllocationRecordCommon::allocate( + MemorySpace const& arg_space, std::string const& arg_label, + size_t arg_alloc_size) -> derived_t* { + return new derived_t(arg_space, arg_label, arg_alloc_size); +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::allocate_tracked(const MemorySpace& arg_space, + const std::string& arg_alloc_label, + size_t arg_alloc_size) { + if (!arg_alloc_size) return nullptr; + + SharedAllocationRecord* const r = + allocate(arg_space, arg_alloc_label, arg_alloc_size); + + record_base_t::increment(r); + + return r->data(); +} + +template +void HostInaccessibleSharedAllocationRecordCommon::deallocate( + HostInaccessibleSharedAllocationRecordCommon::record_base_t* arg_rec) { + delete static_cast(arg_rec); +} + +template +void HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::deallocate_tracked(void* arg_alloc_ptr) { + if (arg_alloc_ptr != nullptr) { + SharedAllocationRecord* const r = derived_t::get_record(arg_alloc_ptr); + record_base_t::decrement(r); + } +} + +template +void* HostInaccessibleSharedAllocationRecordCommon< + MemorySpace>::reallocate_tracked(void* arg_alloc_ptr, + size_t arg_alloc_size) { + derived_t* const r_old = derived_t::get_record(arg_alloc_ptr); + derived_t* const r_new = + allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); + + Kokkos::Impl::DeepCopy( + r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); + Kokkos::fence(std::string("SharedAllocationRecord<") + MemorySpace::name() + + ", void>::reallocate_tracked(): fence after copying data"); record_base_t::increment(r_new); record_base_t::decrement(r_old); @@ -108,20 +223,6 @@ std::string SharedAllocationRecordCommon::get_label() const { return record_base_t::m_label; } -template -void SharedAllocationRecordCommon:: - _fill_host_accessible_header_info(SharedAllocationHeader& arg_header, - std::string const& arg_label) { - // Fill in 
the Header information, directly accessible on the host - - arg_header.m_record = &self(); - - strncpy(arg_header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - arg_header.m_label[SharedAllocationHeader::maximum_label_length - 1] = '\0'; -} - template void SharedAllocationRecordCommon::print_records( std::ostream& s, const MemorySpace&, bool detail) { diff --git a/core/src/impl/Kokkos_Spinwait.hpp b/core/src/impl/Kokkos_Spinwait.hpp deleted file mode 100644 index c57b17d646a..00000000000 --- a/core/src/impl/Kokkos_Spinwait.hpp +++ /dev/null @@ -1,109 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_SPINWAIT_HPP -#define KOKKOS_SPINWAIT_HPP - -#include -#include - -#include - -#include - -namespace Kokkos { -namespace Impl { - -enum class WaitMode : int { - ACTIVE // Used for tight loops to keep threads active longest - , - PASSIVE // Used to quickly yield the thread to quite down the system - , - ROOT // Never sleep or yield the root thread -}; - -void host_thread_yield(const uint32_t i, const WaitMode mode); - -template -std::enable_if_t::value, void> root_spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> root_spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ROOT); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> spinwait_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> yield_while_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value == flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> spinwait_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::ACTIVE); - } - Kokkos::load_fence(); -} - -template -std::enable_if_t::value, void> yield_until_equal( - T const volatile& flag, const T value) { - Kokkos::store_fence(); - uint32_t i = 0; - while (value != flag) { - host_thread_yield(++i, WaitMode::PASSIVE); - } - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -#endif /* #ifndef KOKKOS_SPINWAIT_HPP */ diff --git a/core/src/impl/Kokkos_Utilities.hpp b/core/src/impl/Kokkos_Utilities.hpp index 7e2f130564f..cadeed1a6d8 100644 --- a/core/src/impl/Kokkos_Utilities.hpp +++ b/core/src/impl/Kokkos_Utilities.hpp @@ -49,6 +49,11 @@ struct integral_constant { template struct always_true : std::true_type {}; +// 
type-dependent expression that is always false intended for use in +// static_assert to check "we should never get there" +template +struct always_false : std::false_type {}; + //============================================================================== #if defined(__cpp_lib_type_identity) diff --git a/core/src/impl/Kokkos_ViewArray.hpp b/core/src/impl/Kokkos_ViewArray.hpp index 725ba5de092..fe43b630184 100644 --- a/core/src/impl/Kokkos_ViewArray.hpp +++ b/core/src/impl/Kokkos_ViewArray.hpp @@ -27,10 +27,9 @@ struct ViewDataAnalysis> { private: using array_analysis = ViewArrayAnalysis; - static_assert(std::is_void
<typename array_analysis::specialize>::value, ""); + static_assert(std::is_void<typename array_analysis::specialize>
::value); static_assert(std::is_same>::value, - ""); + Kokkos::Array>::value); static_assert(std::is_scalar::value, "View of Array type must be of a scalar type"); @@ -130,6 +129,12 @@ class ViewMapping> { return m_impl_offset.m_dim.extent(r); } + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + using dim_type = typename offset_type::dimension_type; + return dim_type::static_extent(r); + } + KOKKOS_INLINE_FUNCTION constexpr typename Traits::array_layout layout() const { return m_impl_offset.layout(); @@ -507,7 +512,7 @@ class ViewMapping< Kokkos::LayoutStride>::value))>, SrcTraits, Args...> { private: - static_assert(SrcTraits::rank == sizeof...(Args), ""); + static_assert(SrcTraits::rank == sizeof...(Args)); enum : bool { R0 = is_integral_extent<0, Args...>::value, diff --git a/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/core/src/impl/Kokkos_ViewDataAnalysis.hpp new file mode 100644 index 00000000000..04c0c9aeede --- /dev/null +++ b/core/src/impl/Kokkos_ViewDataAnalysis.hpp @@ -0,0 +1,402 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_VIEW_DATA_ANALYSIS_HPP +#define KOKKOS_VIEW_DATA_ANALYSIS_HPP + +#include + +namespace Kokkos::Impl { + +template +struct variadic_size_t { + enum : size_t { value = KOKKOS_INVALID_INDEX }; +}; + +template +struct variadic_size_t<0, Val, Args...> { + enum : size_t { value = Val }; +}; + +template +struct variadic_size_t { + enum : size_t { value = variadic_size_t::value }; +}; + +template +struct rank_dynamic; + +template <> +struct rank_dynamic<> { + enum : unsigned { value = 0 }; +}; + +template +struct rank_dynamic { + enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic::value }; +}; + +#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ + template \ + struct ViewDimension##R { \ + static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ + static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? 
V : 1); \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + }; \ + template \ + constexpr size_t ViewDimension##R::ArgN##R; \ + template \ + constexpr size_t ViewDimension##R::N##R; \ + template \ + struct ViewDimension##R<0u, RD> { \ + static constexpr size_t ArgN##R = 0; \ + std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ + }; \ + template \ + constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; + +KOKKOS_IMPL_VIEW_DIMENSION(0) +KOKKOS_IMPL_VIEW_DIMENSION(1) +KOKKOS_IMPL_VIEW_DIMENSION(2) +KOKKOS_IMPL_VIEW_DIMENSION(3) +KOKKOS_IMPL_VIEW_DIMENSION(4) +KOKKOS_IMPL_VIEW_DIMENSION(5) +KOKKOS_IMPL_VIEW_DIMENSION(6) +KOKKOS_IMPL_VIEW_DIMENSION(7) + +#undef KOKKOS_IMPL_VIEW_DIMENSION + +// MSVC does not do empty base class optimization by default. +// Per standard it is required for standard layout types +template +struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension + : public ViewDimension0::value, + rank_dynamic::value>, + public ViewDimension1::value, + rank_dynamic::value>, + public ViewDimension2::value, + rank_dynamic::value>, + public ViewDimension3::value, + rank_dynamic::value>, + public ViewDimension4::value, + rank_dynamic::value>, + public ViewDimension5::value, + rank_dynamic::value>, + public ViewDimension6::value, + rank_dynamic::value>, + public ViewDimension7::value, + rank_dynamic::value> { + using D0 = ViewDimension0::value, + rank_dynamic::value>; + using D1 = ViewDimension1::value, + rank_dynamic::value>; + using D2 = ViewDimension2::value, + rank_dynamic::value>; + using D3 = ViewDimension3::value, + rank_dynamic::value>; + using D4 = ViewDimension4::value, + rank_dynamic::value>; + using D5 = ViewDimension5::value, + rank_dynamic::value>; + using D6 = ViewDimension6::value, + rank_dynamic::value>; + using D7 = ViewDimension7::value, + rank_dynamic::value>; + + using D0::ArgN0; + using D1::ArgN1; + using D2::ArgN2; + using D3::ArgN3; + using D4::ArgN4; + using D5::ArgN5; + using D6::ArgN6; + using D7::ArgN7; + + using D0::N0; + using D1::N1; + using D2::N2; + using D3::N3; + using D4::N4; + using D5::N5; + using D6::N6; + using D7::N7; + + static constexpr unsigned rank = sizeof...(Vals); + static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; + + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; + ViewDimension& operator=(const ViewDimension&) = default; + + KOKKOS_INLINE_FUNCTION + constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, + size_t n5, size_t n6, size_t n7) + : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), + D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), + D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), + D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), + D4(n4 == KOKKOS_INVALID_INDEX ? 1 : n4), + D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), + D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), + D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} + + KOKKOS_INLINE_FUNCTION + constexpr size_t extent(const unsigned r) const noexcept { + return r == 0 + ? N0 + : (r == 1 + ? N1 + : (r == 2 + ? N2 + : (r == 3 + ? N3 + : (r == 4 + ? N4 + : (r == 5 + ? N5 + : (r == 6 + ? N6 + : (r == 7 ? 
N7 + : 0))))))); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return r == 0 + ? ArgN0 + : (r == 1 + ? ArgN1 + : (r == 2 + ? ArgN2 + : (r == 3 + ? ArgN3 + : (r == 4 + ? ArgN4 + : (r == 5 + ? ArgN5 + : (r == 6 + ? ArgN6 + : (r == 7 ? ArgN7 + : 0))))))); + } + + template + struct prepend { + using type = ViewDimension; + }; + + template + struct append { + using type = ViewDimension; + }; +}; + +template +struct ViewDimensionJoin; + +template +struct ViewDimensionJoin, ViewDimension> { + using type = ViewDimension; +}; + +//---------------------------------------------------------------------------- + +template +struct ViewDimensionAssignable; + +template +struct ViewDimensionAssignable, + ViewDimension> { + using dst = ViewDimension; + using src = ViewDimension; + + enum { + value = unsigned(dst::rank) == unsigned(src::rank) && + ( + // Compile time check that potential static dimensions match + ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) + ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) + : true) && + ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) + ? (size_t(dst::ArgN1) == size_t(src::ArgN1)) + : true) && + ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) + ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) + : true) && + ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) + ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) + : true) && + ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) + ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) + : true) && + ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) + ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) + : true) && + ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) + ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) + : true) && + ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) + ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) + : true)) + }; +}; + +/** \brief Given a value type and dimension generate the View data type */ +template +struct ViewDataType; + +template +struct ViewDataType> { + using type = T; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type; +}; + +template +struct ViewDataType> { + using type = typename ViewDataType>::type[N]; +}; + +/**\brief Analysis of View data type. + * + * Data type conforms to one of the following patterns : + * {const} value_type [][#][#][#] + * {const} value_type ***[#][#][#] + * Where the sum of counts of '*' and '[#]' is at most ten. + * + * Provide alias for ViewDimension<...> and value_type. 
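+ *
+ * For example, for the data type  double*[3]  the recursion below yields
+ *   value_type        = double
+ *   dynamic_dimension = ViewDimension<0>    (one runtime '*' extent)
+ *   static_dimension  = ViewDimension<3>    (one compile-time '[#]' extent)
+ *   dimension         = ViewDimension<0, 3>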
+ */ +template +struct ViewArrayAnalysis { + using value_type = T; + using const_value_type = std::add_const_t; + using non_const_value_type = std::remove_const_t; + using static_dimension = ViewDimension<>; + using dynamic_dimension = ViewDimension<>; + using dimension = ViewDimension<>; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using static_dimension = + typename nested::static_dimension::template prepend::type; + + using dynamic_dimension = typename nested::dynamic_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + using nested_dimension = typename nested::dimension; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewArrayAnalysis { + private: + using nested = ViewArrayAnalysis; + + public: + using value_type = typename nested::value_type; + using const_value_type = typename nested::const_value_type; + using non_const_value_type = typename nested::non_const_value_type; + + using dynamic_dimension = + typename nested::dynamic_dimension::template prepend<0>::type; + + using static_dimension = typename nested::static_dimension; + + using dimension = + typename ViewDimensionJoin::type; +}; + +template +struct ViewDataAnalysis { + private: + using array_analysis = ViewArrayAnalysis; + + // ValueType is opportunity for partial specialization. + // Must match array analysis when this default template is used. + static_assert( + std::is_same::value); + + public: + using specialize = void; // No specialization + + using dimension = typename array_analysis::dimension; + using value_type = typename array_analysis::value_type; + using const_value_type = typename array_analysis::const_value_type; + using non_const_value_type = typename array_analysis::non_const_value_type; + + // Generate analogous multidimensional array specification type. + using type = typename ViewDataType::type; + using const_type = typename ViewDataType::type; + using non_const_type = + typename ViewDataType::type; + + // Generate "flattened" multidimensional array specification type. 
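+  // (Trivially so for this unspecialized analysis: the value type is
+  // already a scalar, so the flattened aliases coincide with `type` and
+  // friends above. Partial specializations, e.g. the Kokkos::Array
+  // analysis in Kokkos_ViewArray.hpp, diverge here by folding the inner
+  // array extent into an extra trailing dimension.)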
+ using scalar_array_type = type; + using const_scalar_array_type = const_type; + using non_const_scalar_array_type = non_const_type; +}; + +template +struct ViewOffset { + using is_mapping_plugin = std::false_type; +}; +} // namespace Kokkos::Impl + +#endif // KOKKOS_VIEW_DATA_ANALYSIS_HPP diff --git a/core/src/impl/Kokkos_ViewMapping.hpp b/core/src/impl/Kokkos_ViewMapping.hpp index 01d0dc4f681..3217c76e380 100644 --- a/core/src/impl/Kokkos_ViewMapping.hpp +++ b/core/src/impl/Kokkos_ViewMapping.hpp @@ -33,255 +33,7 @@ #include #include #include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct variadic_size_t { - enum : size_t { value = KOKKOS_INVALID_INDEX }; -}; - -template -struct variadic_size_t<0, Val, Args...> { - enum : size_t { value = Val }; -}; - -template -struct variadic_size_t { - enum : size_t { value = variadic_size_t::value }; -}; - -template -struct rank_dynamic; - -template <> -struct rank_dynamic<> { - enum : unsigned { value = 0 }; -}; - -template -struct rank_dynamic { - enum : unsigned { value = (Val == 0 ? 1 : 0) + rank_dynamic::value }; -}; - -#define KOKKOS_IMPL_VIEW_DIMENSION(R) \ - template \ - struct ViewDimension##R { \ - static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ - static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - }; \ - template \ - constexpr size_t ViewDimension##R::ArgN##R; \ - template \ - constexpr size_t ViewDimension##R::N##R; \ - template \ - struct ViewDimension##R<0u, RD> { \ - static constexpr size_t ArgN##R = 0; \ - std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ - KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ - }; \ - template \ - constexpr size_t ViewDimension##R<0u, RD>::ArgN##R; - -KOKKOS_IMPL_VIEW_DIMENSION(0) -KOKKOS_IMPL_VIEW_DIMENSION(1) -KOKKOS_IMPL_VIEW_DIMENSION(2) -KOKKOS_IMPL_VIEW_DIMENSION(3) -KOKKOS_IMPL_VIEW_DIMENSION(4) -KOKKOS_IMPL_VIEW_DIMENSION(5) -KOKKOS_IMPL_VIEW_DIMENSION(6) -KOKKOS_IMPL_VIEW_DIMENSION(7) - -#undef KOKKOS_IMPL_VIEW_DIMENSION - -// MSVC does not do empty base class optimization by default. 
-// Per standard it is required for standard layout types -template -struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension - : public ViewDimension0::value, - rank_dynamic::value>, - public ViewDimension1::value, - rank_dynamic::value>, - public ViewDimension2::value, - rank_dynamic::value>, - public ViewDimension3::value, - rank_dynamic::value>, - public ViewDimension4::value, - rank_dynamic::value>, - public ViewDimension5::value, - rank_dynamic::value>, - public ViewDimension6::value, - rank_dynamic::value>, - public ViewDimension7::value, - rank_dynamic::value> { - using D0 = ViewDimension0::value, - rank_dynamic::value>; - using D1 = ViewDimension1::value, - rank_dynamic::value>; - using D2 = ViewDimension2::value, - rank_dynamic::value>; - using D3 = ViewDimension3::value, - rank_dynamic::value>; - using D4 = ViewDimension4::value, - rank_dynamic::value>; - using D5 = ViewDimension5::value, - rank_dynamic::value>; - using D6 = ViewDimension6::value, - rank_dynamic::value>; - using D7 = ViewDimension7::value, - rank_dynamic::value>; - - using D0::ArgN0; - using D1::ArgN1; - using D2::ArgN2; - using D3::ArgN3; - using D4::ArgN4; - using D5::ArgN5; - using D6::ArgN6; - using D7::ArgN7; - - using D0::N0; - using D1::N1; - using D2::N2; - using D3::N3; - using D4::N4; - using D5::N5; - using D6::N6; - using D7::N7; - - static constexpr unsigned rank = sizeof...(Vals); - static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; - ViewDimension& operator=(const ViewDimension&) = default; - - KOKKOS_INLINE_FUNCTION - constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, - size_t n5, size_t n6, size_t n7) - : D0(n0 == KOKKOS_INVALID_INDEX ? 1 : n0), - D1(n1 == KOKKOS_INVALID_INDEX ? 1 : n1), - D2(n2 == KOKKOS_INVALID_INDEX ? 1 : n2), - D3(n3 == KOKKOS_INVALID_INDEX ? 1 : n3), - D4(n4 == KOKKOS_INVALID_INDEX ? 1 : n4), - D5(n5 == KOKKOS_INVALID_INDEX ? 1 : n5), - D6(n6 == KOKKOS_INVALID_INDEX ? 1 : n6), - D7(n7 == KOKKOS_INVALID_INDEX ? 1 : n7) {} - - KOKKOS_INLINE_FUNCTION - constexpr size_t extent(const unsigned r) const noexcept { - return r == 0 - ? N0 - : (r == 1 - ? N1 - : (r == 2 - ? N2 - : (r == 3 - ? N3 - : (r == 4 - ? N4 - : (r == 5 - ? N5 - : (r == 6 - ? N6 - : (r == 7 ? N7 - : 0))))))); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return r == 0 - ? ArgN0 - : (r == 1 - ? ArgN1 - : (r == 2 - ? ArgN2 - : (r == 3 - ? ArgN3 - : (r == 4 - ? ArgN4 - : (r == 5 - ? ArgN5 - : (r == 6 - ? ArgN6 - : (r == 7 ? ArgN7 - : 0))))))); - } - - template - struct prepend { - using type = ViewDimension; - }; - - template - struct append { - using type = ViewDimension; - }; -}; - -template -struct ViewDimensionJoin; - -template -struct ViewDimensionJoin, ViewDimension> { - using type = ViewDimension; -}; - -//---------------------------------------------------------------------------- - -template -struct ViewDimensionAssignable; - -template -struct ViewDimensionAssignable, - ViewDimension> { - using dst = ViewDimension; - using src = ViewDimension; - - enum { - value = unsigned(dst::rank) == unsigned(src::rank) && - ( - // Compile time check that potential static dimensions match - ((1 > dst::rank_dynamic && 1 > src::rank_dynamic) - ? (size_t(dst::ArgN0) == size_t(src::ArgN0)) - : true) && - ((2 > dst::rank_dynamic && 2 > src::rank_dynamic) - ? 
(size_t(dst::ArgN1) == size_t(src::ArgN1)) - : true) && - ((3 > dst::rank_dynamic && 3 > src::rank_dynamic) - ? (size_t(dst::ArgN2) == size_t(src::ArgN2)) - : true) && - ((4 > dst::rank_dynamic && 4 > src::rank_dynamic) - ? (size_t(dst::ArgN3) == size_t(src::ArgN3)) - : true) && - ((5 > dst::rank_dynamic && 5 > src::rank_dynamic) - ? (size_t(dst::ArgN4) == size_t(src::ArgN4)) - : true) && - ((6 > dst::rank_dynamic && 6 > src::rank_dynamic) - ? (size_t(dst::ArgN5) == size_t(src::ArgN5)) - : true) && - ((7 > dst::rank_dynamic && 7 > src::rank_dynamic) - ? (size_t(dst::ArgN6) == size_t(src::ArgN6)) - : true) && - ((8 > dst::rank_dynamic && 8 > src::rank_dynamic) - ? (size_t(dst::ArgN7) == size_t(src::ArgN7)) - : true)) - }; -}; - -} // namespace Impl -} // namespace Kokkos +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -657,21 +409,20 @@ struct SubviewExtents { template KOKKOS_INLINE_FUNCTION SubviewExtents(const ViewDimension& dim, Args... args) { - static_assert(DomainRank == sizeof...(DimArgs), ""); - static_assert(DomainRank == sizeof...(Args), ""); + static_assert(DomainRank == sizeof...(DimArgs)); + static_assert(DomainRank == sizeof...(Args)); // Verifies that all arguments, up to 8, are integral types, // integral extents, or don't exist. - static_assert( - RangeRank == unsigned(is_integral_extent<0, Args...>::value) + - unsigned(is_integral_extent<1, Args...>::value) + - unsigned(is_integral_extent<2, Args...>::value) + - unsigned(is_integral_extent<3, Args...>::value) + - unsigned(is_integral_extent<4, Args...>::value) + - unsigned(is_integral_extent<5, Args...>::value) + - unsigned(is_integral_extent<6, Args...>::value) + - unsigned(is_integral_extent<7, Args...>::value), - ""); + static_assert(RangeRank == + unsigned(is_integral_extent<0, Args...>::value) + + unsigned(is_integral_extent<1, Args...>::value) + + unsigned(is_integral_extent<2, Args...>::value) + + unsigned(is_integral_extent<3, Args...>::value) + + unsigned(is_integral_extent<4, Args...>::value) + + unsigned(is_integral_extent<5, Args...>::value) + + unsigned(is_integral_extent<6, Args...>::value) + + unsigned(is_integral_extent<7, Args...>::value)); if (RangeRank == 0) { m_length[0] = 0; @@ -708,149 +459,6 @@ struct SubviewExtents { namespace Kokkos { namespace Impl { - -/** \brief Given a value type and dimension generate the View data type */ -template -struct ViewDataType; - -template -struct ViewDataType> { - using type = T; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type; -}; - -template -struct ViewDataType> { - using type = typename ViewDataType>::type[N]; -}; - -/**\brief Analysis of View data type. - * - * Data type conforms to one of the following patterns : - * {const} value_type [][#][#][#] - * {const} value_type ***[#][#][#] - * Where the sum of counts of '*' and '[#]' is at most ten. - * - * Provide alias for ViewDimension<...> and value_type. 
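The dimension machinery removed above is not deleted outright; it moves into the new Kokkos_ViewDataAnalysis.hpp header, which the `+#include` line pulls in instead. For readers following the refactor, here is a minimal standalone sketch (not the Kokkos implementation itself) of the rank_dynamic idea: a compile-time extent of 0 marks a runtime-sized dimension, and rank_dynamic counts those zeros.

    #include <cstddef>

    // Sketch: count the dynamic (runtime-sized) dimensions in an extent
    // list, where an extent of 0 stands for a '*' in the View data type.
    template <std::size_t... Vals>
    struct rank_dynamic_sketch;

    template <>
    struct rank_dynamic_sketch<> {
      static constexpr unsigned value = 0;
    };

    template <std::size_t Val, std::size_t... Vals>
    struct rank_dynamic_sketch<Val, Vals...> {
      static constexpr unsigned value =
          (Val == 0 ? 1 : 0) + rank_dynamic_sketch<Vals...>::value;
    };

    // double**[3] has extents (0, 0, 3): two dynamic dimensions.
    static_assert(rank_dynamic_sketch<0, 0, 3>::value == 2);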
- */ -template -struct ViewArrayAnalysis { - using value_type = T; - using const_value_type = std::add_const_t; - using non_const_value_type = std::remove_const_t; - using static_dimension = ViewDimension<>; - using dynamic_dimension = ViewDimension<>; - using dimension = ViewDimension<>; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using static_dimension = - typename nested::static_dimension::template prepend::type; - - using dynamic_dimension = typename nested::dynamic_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - using nested_dimension = typename nested::dimension; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewArrayAnalysis { - private: - using nested = ViewArrayAnalysis; - - public: - using value_type = typename nested::value_type; - using const_value_type = typename nested::const_value_type; - using non_const_value_type = typename nested::non_const_value_type; - - using dynamic_dimension = - typename nested::dynamic_dimension::template prepend<0>::type; - - using static_dimension = typename nested::static_dimension; - - using dimension = - typename ViewDimensionJoin::type; -}; - -template -struct ViewDataAnalysis { - private: - using array_analysis = ViewArrayAnalysis; - - // ValueType is opportunity for partial specialization. - // Must match array analysis when this default template is used. - static_assert( - std::is_same::value, - ""); - - public: - using specialize = void; // No specialization - - using dimension = typename array_analysis::dimension; - using value_type = typename array_analysis::value_type; - using const_value_type = typename array_analysis::const_value_type; - using non_const_value_type = typename array_analysis::non_const_value_type; - - // Generate analogous multidimensional array specification type. - using type = typename ViewDataType::type; - using const_type = typename ViewDataType::type; - using non_const_type = - typename ViewDataType::type; - - // Generate "flattened" multidimensional array specification type. 
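The ViewDataType/ViewArrayAnalysis block removed above rebuilds a multidimensional data type such as double**[3] from a value type plus an extent list. A hedged sketch of that mapping with hypothetical names (the real code threads a ViewDimension<...> through instead of a raw extent pack):

    #include <cstddef>
    #include <type_traits>

    // Sketch: extent 0 prepends a pointer (runtime dimension); a nonzero
    // extent appends a static array bound.
    template <class T, std::size_t... Ns>
    struct view_data_type_sketch {
      using type = T;
    };

    template <class T, std::size_t... Ns>
    struct view_data_type_sketch<T, 0, Ns...> {
      using type = typename view_data_type_sketch<T*, Ns...>::type;
    };

    template <class T, std::size_t N, std::size_t... Ns>
    struct view_data_type_sketch<T, N, Ns...> {
      using type = typename view_data_type_sketch<T, Ns...>::type[N];
    };

    static_assert(
        std::is_same_v<view_data_type_sketch<double, 0, 0, 3>::type,
                       double**[3]>);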
- using scalar_array_type = type; - using const_scalar_array_type = const_type; - using non_const_scalar_array_type = non_const_type; -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewOffset { - using is_mapping_plugin = std::false_type; -}; - //---------------------------------------------------------------------------- // LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding template @@ -2919,13 +2527,9 @@ struct ViewValueFunctor { "Kokkos::View::initialization [" + name + "] via memset", Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( - space, - Kokkos::View>(ptr, n), - value); + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); @@ -2949,37 +2553,33 @@ struct ViewValueFunctor { template void parallel_for_implementation() { - if (!space.in_parallel()) { - using PolicyType = - Kokkos::RangePolicy, Tag>; - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - const std::string functor_name = - (std::is_same_v - ? "Kokkos::View::destruction [" + name + "]" - : "Kokkos::View::initialization [" + name + "]"); - Kokkos::Profiling::beginParallelFor( - functor_name, Kokkos::Profiling::Experimental::device_id(space), - &kpID); - } + using PolicyType = + Kokkos::RangePolicy, Tag>; + PolicyType policy(space, 0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + const std::string functor_name = + (std::is_same_v + ? 
"Kokkos::View::destruction [" + name + "]" + : "Kokkos::View::initialization [" + name + "]"); + Kokkos::Profiling::beginParallelFor( + functor_name, Kokkos::Profiling::Experimental::device_id(space), + &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(Tag{}, i); + const Kokkos::Impl::ParallelFor closure( + *this, policy); + closure.execute(); + if (default_exec_space || std::is_same_v) + space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } } @@ -3057,13 +2657,9 @@ struct ViewValueFunctor { Kokkos::Profiling::Experimental::device_id(space), &kpID); } - (void)ZeroMemset< - ExecSpace, Kokkos::View>>( - space, - Kokkos::View>(ptr, n), - value); + (void)ZeroMemset( + space, Kokkos::View>(ptr, n)); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); @@ -3086,32 +2682,28 @@ struct ViewValueFunctor { } void parallel_for_implementation() { - if (!space.in_parallel()) { - PolicyType policy(0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } + PolicyType policy(0, n); + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "]", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } + if (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, + true); + } #endif - const Kokkos::Impl::ParallelFor closure( - *this, PolicyType(0, n)); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } else { - for (size_t i = 0; i < n; ++i) operator()(i); + const Kokkos::Impl::ParallelFor closure( + *this, PolicyType(0, n)); + closure.execute(); + if (default_exec_space) + space.fence( + "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " + "view"); + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); } } @@ -3896,7 +3488,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value); using traits_type = Kokkos::ViewTraits -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType&) { - return true; +template +KOKKOS_FUNCTION bool within_range(Map const& map, + std::index_sequence, + Indices... 
indices) { + return (((std::size_t)indices < map.extent(Enumerate)) && ...); } -template -KOKKOS_INLINE_FUNCTION bool view_verify_operator_bounds(const MapType& map, - const iType& i, - Args... args) { - return (size_t(i) < map.extent(R)) && - view_verify_operator_bounds(map, args...); +template +KOKKOS_FUNCTION constexpr char* append_formatted_multidimensional_index( + char* dest, Indices... indices) { + char* d = dest; + strcat(d, "["); + ( + [&] { + d += strlen(d); + to_chars_i(d, + d + 20, // 20 digits ought to be enough + indices); + strcat(d, ","); + }(), + ...); + d[strlen(d) - 1] = ']'; // overwrite trailing comma + return dest; } -template -inline void view_error_operator_bounds(char*, int, const MapType&) {} - -template -inline void view_error_operator_bounds(char* buf, int len, const MapType& map, - const iType& i, Args... args) { - const int n = snprintf( - buf, len, " %ld < %ld %c", static_cast(i), - static_cast(map.extent(R)), (sizeof...(Args) ? ',' : ')')); - view_error_operator_bounds(buf + n, len - n, map, args...); +template +KOKKOS_FUNCTION void print_extents(char* dest, Map const& map, + std::index_sequence) { + append_formatted_multidimensional_index(dest, map.extent(Enumerate)...); } -/* Check #3: is the View managed as determined by the MemoryTraits? */ -template -struct OperatorBoundsErrorOnDevice; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const&) { Kokkos::abort("View bounds error"); } -}; - -template -struct OperatorBoundsErrorOnDevice { - KOKKOS_INLINE_FUNCTION - static void run(MapType const& map) { - SharedAllocationHeader const* const header = - SharedAllocationHeader::get_header( - static_cast(map.data())); - char const* const label = header->label(); - enum { LEN = 128 }; - char msg[LEN]; - char const* const first_part = "View bounds error of view "; - char* p = msg; - char* const end = msg + LEN - 1; - for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) { - *p = *p2; - } - *p = '\0'; - Kokkos::abort(msg); - } -}; - -/* Check #2: does the ViewMapping have the printable_label_typedef defined? - See above that only the non-specialized standard-layout ViewMapping has - this defined by default. - The existence of this alias indicates the existence of MapType::is_managed - */ template using printable_label_typedef_t = typename T::printable_label_typedef; -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const&) { - Kokkos::abort("View bounds error"); -} - -template -KOKKOS_FUNCTION - std::enable_if_t::value> - operator_bounds_error_on_device(Map const& map) { - OperatorBoundsErrorOnDevice::run(map); -} - template KOKKOS_INLINE_FUNCTION void view_verify_operator_bounds( Kokkos::Impl::ViewTracker const& tracker, const MapType& map, Args... 
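The new append_formatted_multidimensional_index above renders the offending indices as "[i,j,k]" using only freestanding-friendly calls (strcat plus Kokkos' to_chars_i), because the device path cannot use std::string or snprintf. A host-only sketch of the same formatting with standard-library calls, illustrative names only:

    #include <cstdio>
    #include <cstring>

    template <class... Indices>
    char* format_index_sketch(char* dest, Indices... indices) {
      std::strcat(dest, "[");
      // Append "<value>," for each index, then patch the trailing comma.
      (
          [&] {
            char tmp[24];
            std::snprintf(tmp, sizeof tmp, "%lld,",
                          static_cast<long long>(indices));
            std::strcat(dest, tmp);
          }(),
          ...);
      dest[std::strlen(dest) - 1] = ']';  // overwrite trailing comma
      return dest;
    }

    // Usage: char buf[64] = ""; format_index_sketch(buf, 1, 2, 3); -> "[1,2,3]"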
args) { - if (!view_verify_operator_bounds<0>(map, args...)) { + if (!within_range(map, std::make_index_sequence(), + args...)) { + char err[256] = ""; + strcat(err, "Kokkos::View ERROR: out of bounds access"); + strcat(err, " label=(\""); KOKKOS_IF_ON_HOST( - (enum {LEN = 1024}; char buffer[LEN]; - const std::string label = - tracker.m_tracker.template get_label(); - int n = snprintf(buffer, LEN, "View bounds error of view %s (", - label.c_str()); - view_error_operator_bounds<0>(buffer + n, LEN - n, map, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) - - KOKKOS_IF_ON_DEVICE(( - /* Check #1: is there a SharedAllocationRecord? - (we won't use it, but if its not there then there isn't - a corresponding SharedAllocationHeader containing a label). - This check should cover the case of Views that don't - have the Unmanaged trait but were initialized by pointer. */ if (tracker.m_tracker.has_record()) { - operator_bounds_error_on_device(map); - } else { Kokkos::abort("View bounds error"); })) + strncat(err, tracker.m_tracker.template get_label().c_str(), + 128); + } else { strcat(err, "**UNMANAGED**"); }) + KOKKOS_IF_ON_DEVICE([&] { + // Check #1: is there a SharedAllocationRecord? (we won't use it, but + // if its not there then there isn't a corresponding + // SharedAllocationHeader containing a label). This check should cover + // the case of Views that don't have the Unmanaged trait but were + // initialized by pointer. + if (!tracker.m_tracker.has_record()) { + strcat(err, "**UNMANAGED**"); + return; + } + // Check #2: does the ViewMapping have the printable_label_typedef + // defined? See above that only the non-specialized standard-layout + // ViewMapping has this defined by default. The existence of this + // alias indicates the existence of MapType::is_managed + if constexpr (is_detected_v) { + // Check #3: is the View managed as determined by the MemoryTraits? + if constexpr (MapType::is_managed != 0) { + SharedAllocationHeader const* const header = + SharedAllocationHeader::get_header( + static_cast(map.data())); + char const* const label = header->label(); + strcat(err, label); + return; + } + strcat(err, "**UNAVAILABLE**"); + } + }();) + strcat(err, "\") with indices "); + append_formatted_multidimensional_index(err, args...); + strcat(err, " but extents "); + print_extents(err, map, std::make_index_sequence()); + Kokkos::abort(err); } } diff --git a/core/src/setup/Kokkos_Setup_SYCL.hpp b/core/src/setup/Kokkos_Setup_SYCL.hpp index 7f7957bc61f..30f6fa2ad23 100644 --- a/core/src/setup/Kokkos_Setup_SYCL.hpp +++ b/core/src/setup/Kokkos_Setup_SYCL.hpp @@ -38,12 +38,11 @@ #include #endif -#ifdef __SYCL_DEVICE_ONLY__ -#define KOKKOS_IMPL_DO_NOT_USE_PRINTF(format, ...) \ - do { \ - const __attribute__((opencl_constant)) char fmt[] = (format); \ - sycl::ext::oneapi::experimental::printf(fmt, ##__VA_ARGS__); \ - } while (0) +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20230200 +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) \ + accessor.get_multi_ptr() +#else +#define KOKKOS_IMPL_SYCL_GET_MULTI_PTR(accessor) accessor.get_pointer() #endif #endif diff --git a/core/src/traits/Kokkos_IndexTypeTrait.hpp b/core/src/traits/Kokkos_IndexTypeTrait.hpp index 91820fbccac..e43535451c3 100644 --- a/core/src/traits/Kokkos_IndexTypeTrait.hpp +++ b/core/src/traits/Kokkos_IndexTypeTrait.hpp @@ -83,7 +83,7 @@ struct IndexTypePolicyMixin : AnalyzeNextTrait { "Kokkos Error: More than one index type given. 
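The rewritten bounds check above replaces the old recursive view_verify_operator_bounds<R> helpers with a single fold expression, and assembles the whole error message in a stack buffer so one code path serves host and device. A standalone sketch of the fold-based range test, using a hypothetical extents holder in place of the Kokkos view map:

    #include <cstddef>
    #include <utility>

    struct extents3 {  // stand-in for the view map
      std::size_t e[3];
      std::size_t extent(std::size_t r) const { return e[r]; }
    };

    template <class Map, std::size_t... R, class... Indices>
    bool within_range_sketch(Map const& map, std::index_sequence<R...>,
                             Indices... indices) {
      // Pairs each index with its rank via the index_sequence and requires
      // every index to be below the matching extent.
      return ((static_cast<std::size_t>(indices) < map.extent(R)) && ...);
    }

    int main() {
      extents3 m{{4, 5, 6}};
      bool ok  = within_range_sketch(m, std::make_index_sequence<3>(), 1, 2, 3);
      bool bad = within_range_sketch(m, std::make_index_sequence<3>(), 1, 9, 3);
      return (ok && !bad) ? 0 : 1;
    }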
Search " "compiler output for 'show_extra_index_type' to see the " "type of the errant tag."); - static_assert(std::is_integral::value, ""); + static_assert(std::is_integral::value); static constexpr bool index_type_is_defaulted = false; using index_type = Kokkos::IndexType; }; diff --git a/core/src/traits/Kokkos_OccupancyControlTrait.hpp b/core/src/traits/Kokkos_OccupancyControlTrait.hpp index dadf582c372..c2ca5a341f1 100644 --- a/core/src/traits/Kokkos_OccupancyControlTrait.hpp +++ b/core/src/traits/Kokkos_OccupancyControlTrait.hpp @@ -163,7 +163,7 @@ auto prefer(Policy const& p, DesiredOccupancy occ) { template constexpr auto prefer(Policy const& p, MaximizeOccupancy) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::OccupancyControlTrait::policy_with_trait; diff --git a/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp b/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp index 578e9e762ad..98ad1d7ebbb 100644 --- a/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp +++ b/core/src/traits/Kokkos_PolicyTraitAdaptor.hpp @@ -68,7 +68,7 @@ struct PolicyTraitAdaptorImpl< TraitSpec, PolicyTemplate, type_list, type_list, NewTrait, std::enable_if_t::value>> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; @@ -92,7 +92,7 @@ template class PolicyTemplate, struct PolicyTraitAdaptorImpl, type_list<>, NewTrait> { - static_assert(PolicyTraitMatcher::value, ""); + static_assert(PolicyTraitMatcher::value); using type = PolicyTemplate; }; diff --git a/core/src/traits/Kokkos_ScheduleTrait.hpp b/core/src/traits/Kokkos_ScheduleTrait.hpp index 86130025530..4e91d89f0f9 100644 --- a/core/src/traits/Kokkos_ScheduleTrait.hpp +++ b/core/src/traits/Kokkos_ScheduleTrait.hpp @@ -78,7 +78,7 @@ namespace Experimental { template constexpr auto require(Policy const& p, Kokkos::Schedule) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::ScheduleTrait::policy_with_trait< Policy, Kokkos::Schedule>; return new_policy_t{p}; diff --git a/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp b/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp index 8f95385c851..ae7aa6e534f 100644 --- a/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp +++ b/core/src/traits/Kokkos_WorkItemPropertyTrait.hpp @@ -57,7 +57,7 @@ namespace Experimental { template constexpr auto require(const Policy p, WorkItemProperty::ImplWorkItemProperty) { - static_assert(Kokkos::is_execution_policy::value, ""); + static_assert(Kokkos::is_execution_policy::value); using new_policy_t = Kokkos::Impl::WorkItemPropertyTrait::policy_with_trait< Policy, WorkItemProperty::ImplWorkItemProperty>; return new_policy_t{p}; diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index b71c72c3c9f..6dfb7505c5d 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -65,7 +65,7 @@ SET(KOKKOS_THREADS_NAME Threads) IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) SET(KOKKOS_OPENACC_FEATURE_LEVEL 9) ELSE() - SET(KOKKOS_OPENACC_FEATURE_LEVEL 16) + SET(KOKKOS_OPENACC_FEATURE_LEVEL 17) ENDIF() SET(KOKKOS_OPENACC_NAME Experimental::OpenACC) @@ -86,11 +86,13 @@ SET(COMPILE_ONLY_SOURCES TestDetectionIdiom.cpp TestBitManipulation.cpp TestInterOp.cpp + TestRangePolicyCTAD.cpp TestStringManipulation.cpp TestVersionMacros.cpp TestViewRank.cpp TestViewTypeTraits.cpp TestTypeList.cpp + TestMDRangePolicyCTAD.cpp 
view/TestExtentsDatatypeConversion.cpp ) @@ -184,6 +186,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) MDSpan MinMaxClamp NumericTraits + OccupancyControlTrait Other ParallelScanRangePolicy Printf @@ -200,6 +203,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) Reductions Reductions_DeviceView SharedAlloc + Swap ) set(file ${dir}/Test${Tag}_${Name}.cpp) # Write to a temporary intermediate file and call configure_file to avoid @@ -233,6 +237,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewCopy_a ViewCopy_b ViewCtorDimMatch + ViewEmptyRuntimeUnmanaged ViewHooks ViewLayoutStrideAssignment ViewMapping_a @@ -240,6 +245,7 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;OpenMPTarget;OpenACC;HIP;SYCL) ViewMapping_subview ViewMemoryAccessViolation ViewOfClass + ViewOutOfBoundsAccess ViewResize WorkGraph WithoutInitializing @@ -372,20 +378,21 @@ if(Kokkos_ENABLE_OPENMPTARGET) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_WorkGraph.cpp - IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp - endif() IF (KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_AtomicOperations_shared.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_MinMaxClamp.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_LocalDeepCopy.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamVectorRange.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_ViewAPI_e.cpp + IF (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.0.3) + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c01.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c02.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_SubView_c03.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openmptarget/TestOpenMPTarget_Reducers_d.cpp + endif() endif() # FIXME_OPENMPTARGET_CRAY: The following tests fail at compile time when the OpenMPTarget backend is enabled with the Cray compiler. # Atomic compare/exchange is used in these tests which can be one of the reasons for the compilation failures. 
@@ -522,17 +529,7 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) list(REMOVE_ITEM OpenACC_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_a1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/default/TestDefaultDeviceType_b1.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_double.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_float.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_int.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_longlongint.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_shared.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_unsignedlongint.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp @@ -549,17 +546,10 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp ) endif() @@ -677,7 +667,6 @@ endif() if (Kokkos_ENABLE_OPENMP) set(OpenMP_EXTRA_SOURCES openmp/TestOpenMP_Task.cpp - openmp/TestOpenMP_PartitionMaster.cpp ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_OpenMP @@ -724,12 +713,14 @@ if(Kokkos_ENABLE_HPX) hpx/TestHPX_IndependentInstancesRefCounting.cpp hpx/TestHPX_IndependentInstancesSynchronization.cpp ) +if(Kokkos_ENABLE_DEPRECATED_CODE_4) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HPX_InParallel SOURCES UnitTestMainInit.cpp hpx/TestHPX_InParallel.cpp ) + endif() endif() if(Kokkos_ENABLE_OPENMPTARGET) @@ -797,6 +788,12 @@ if(Kokkos_ENABLE_CUDA) UnitTestMain.cpp cuda/TestCuda_InterOp_Streams.cpp ) + KOKKOS_ADD_EXECUTABLE_AND_TEST( + CoreUnitTest_CudaInterOpStreamsMultiGPU + SOURCES + UnitTestMainInit.cpp + cuda/TestCuda_InterOp_StreamsMultiGPU.cpp + ) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_CudaGraph SOURCES @@ -1039,13 +1036,7 @@ KOKKOS_ADD_ADVANCED_TEST( CoreUnitTest_PushFinalizeHook_terminate tools/TestCategoricalTuner.cpp ) endif() - if((NOT Kokkos_ENABLE_OPENMPTARGET) AND (NOT Kokkos_ENABLE_OPENACC)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_LogicalSpaces - SOURCES - tools/TestLogicalSpaces.cpp - ) - 
endif() + SET(KOKKOSP_SOURCES UnitTestMainInit.cpp tools/TestEventCorrectness.cpp @@ -1167,15 +1158,6 @@ KOKKOS_ADD_TEST( NAME CoreUnitTest_StackTraceTest ) endif() -if(Kokkos_ENABLE_DEPRECATED_CODE_3) - foreach(INITTESTS_NUM RANGE 1 18) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - CoreUnitTest_DefaultInit_${INITTESTS_NUM} - SOURCES UnitTestMain.cpp default/TestDefaultDeviceTypeInit_${INITTESTS_NUM}.cpp - ) - endforeach(INITTESTS_NUM) -endif() - if (KOKKOS_ENABLE_HWLOC) KOKKOS_ADD_EXECUTABLE_AND_TEST( CoreUnitTest_HWLOC @@ -1259,12 +1241,10 @@ if (NOT KOKKOS_HAS_TRILINOS) INPUT TestDeviceAndThreads.py ${USE_SOURCE_PERMISSIONS_WHEN_SUPPORTED} ) - if(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME_OPENMPTARGET does not select the right device - add_test( - NAME Kokkos_CoreUnitTest_DeviceAndThreads - COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py - ) - endif() + add_test( + NAME Kokkos_CoreUnitTest_DeviceAndThreads + COMMAND ${Python3_EXECUTABLE} -m unittest -v $/TestDeviceAndThreads.py + ) endif() endif() diff --git a/core/unit_test/Makefile b/core/unit_test/Makefile index 33a84b61f92..202809d3fc9 100644 --- a/core/unit_test/Makefile +++ b/core/unit_test/Makefile @@ -67,8 +67,8 @@ TESTS = AtomicOperations_int AtomicOperations_unsignedint AtomicOperations_longi tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ tmp2 := $(foreach test, $(TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ ) \ ) @@ -82,8 +82,8 @@ KOKKOS_SUBVIEW_DEVICELIST := $(filter-out Cuda, $(KOKKOS_DEVICELIST)) tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter Test$(device)_$(test).cpp, $(shell ls Test$(device)_$(test).cpp 2>/dev/null)),, \ - $(shell echo "\#include " > Test$(device)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(device)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(device)_$(test).cpp); \ ) \ )\ ) @@ -91,8 +91,8 @@ tmp := $(foreach device, $(KOKKOS_SUBVIEW_DEVICELIST), \ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp2 := $(foreach test, $(SUBVIEW_TESTS), \ $(if $(filter TestCuda_$(test).cpp, $(shell ls TestCuda_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > TestCuda_$(test).cpp); \ - $(shell echo "\#include " >> TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " > TestCuda_$(test).cpp); \ + $(shell echo "$(H)include " >> TestCuda_$(test).cpp); \ )\ ) @@ -100,8 +100,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) @@ -277,8 +277,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(foreach space, $(GPU_SPACES), \ tmp2 := $(foreach test, $(GPU_SPACE_TESTS), \ $(if $(filter Test$(space)_$(test).cpp, $(shell ls Test$(space)_$(test).cpp 2>/dev/null)),,\ - $(shell echo "\#include " > Test$(space)_$(test).cpp); \ - $(shell echo "\#include " >> 
Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " > Test$(space)_$(test).cpp); \ + $(shell echo "$(H)include " >> Test$(space)_$(test).cpp); \ )\ )\ ) diff --git a/core/unit_test/TestAggregate.hpp b/core/unit_test/TestAggregate.hpp index 4f67b2eddce..f1316a7426a 100644 --- a/core/unit_test/TestAggregate.hpp +++ b/core/unit_test/TestAggregate.hpp @@ -29,35 +29,31 @@ void TestViewAggregate() { value_type>; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); using a32_traits = Kokkos::ViewTraits; using flat_traits = Kokkos::ViewTraits; static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); - static_assert(a32_traits::rank == 2, ""); - static_assert(a32_traits::rank_dynamic == 2, ""); + std::is_same::value); + static_assert(a32_traits::rank == 2); + static_assert(a32_traits::rank_dynamic == 2); - static_assert(std::is_void::value, ""); - static_assert(flat_traits::rank == 3, ""); - static_assert(flat_traits::rank_dynamic == 2, ""); - static_assert(flat_traits::dimension::N2 == 32, ""); + static_assert(std::is_void::value); + static_assert(flat_traits::rank == 3); + static_assert(flat_traits::rank_dynamic == 2); + static_assert(flat_traits::dimension::N2 == 32); using a32_type = Kokkos::View **, DeviceType>; using a32_flat_type = typename a32_type::array_type; - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(a32_type::rank == 2, ""); - static_assert(a32_flat_type::rank == 3, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(a32_type::rank == 2); + static_assert(a32_flat_type::rank == 3); a32_type x("test", 4, 5); a32_flat_type y(x); diff --git a/core/unit_test/TestArray.cpp b/core/unit_test/TestArray.cpp index d3bdc4f93f7..673d0036b71 100644 --- a/core/unit_test/TestArray.cpp +++ b/core/unit_test/TestArray.cpp @@ -49,4 +49,28 @@ KOKKOS_FUNCTION constexpr bool test_array_structured_binding_support() { static_assert(test_array_structured_binding_support()); +template +KOKKOS_FUNCTION constexpr bool is_equal(L const& l, R const& r) { + if (std::size(l) != std::size(r)) return false; + + for (size_t i = 0; i != std::size(l); ++i) { + if (l[i] != r[i]) return false; + } + + return true; +} + +// Disable ctad test for intel versions < 2021, see issue #6702 +#if !defined(KOKKOS_COMPILER_INTEL) || KOKKOS_COMPILER_INTEL >= 2021 +KOKKOS_FUNCTION constexpr bool test_array_ctad() { + constexpr int x = 10; + constexpr Kokkos::Array a{1, 2, 3, 5, x}; + constexpr Kokkos::Array b{1, 2, 3, 5, x}; + + return std::is_same_v && is_equal(a, b); +} + +static_assert(test_array_ctad()); +#endif + } // namespace diff --git a/core/unit_test/TestAtomicOperations.hpp b/core/unit_test/TestAtomicOperations.hpp index a5aebed4138..cd7ba47aa1e 100644 --- a/core/unit_test/TestAtomicOperations.hpp +++ b/core/unit_test/TestAtomicOperations.hpp @@ -368,6 +368,63 @@ bool atomic_op_test(T old_val, T update) { return result == 0; } +template +constexpr T relative_error_threshold = T(1.0e-15); + +template +bool atomic_op_test_rel(T old_val, T update) { + Kokkos::View op_data("op_data"); + Kokkos::deep_copy(op_data, old_val); + int result = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, 1), + KOKKOS_LAMBDA(int, int& local_result) { + auto fetch_result = + Op::atomic_op(&op_data(0), &op_data(1), &op_data(2), update); + T expected_val = Op::op(old_val, update); + Kokkos::memory_fence(); + if (expected_val == T(0)) { + if (fabs(op_data(0)) > 
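The new TestArray.cpp case above exercises class template argument deduction for Kokkos::Array: the element type and extent are deduced from the initializer, mirroring std::array. A minimal compile-time sketch of what the test checks:

    #include <Kokkos_Core.hpp>
    #include <type_traits>

    // CTAD deduces Kokkos::Array<int, 3> from the initializer list.
    constexpr Kokkos::Array a{1, 2, 3};
    static_assert(a.size() == 3);
    static_assert(std::is_same_v<std::remove_cv_t<decltype(a)>,
                                 Kokkos::Array<int, 3>>);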
relative_error_threshold) local_result += 1; + if (fabs(op_data(1)) > relative_error_threshold) local_result += 2; + if (fabs(op_data(2)) > relative_error_threshold) local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs(fetch_result.second) > relative_error_threshold) + local_result += 16; + } else { + if (fabs((op_data(0) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 1; + if (fabs((op_data(1) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 2; + if (fabs((op_data(2) - expected_val) / expected_val) > + relative_error_threshold) + local_result += 4; + if (fetch_result.first != old_val) local_result += 8; + if (fabs((fetch_result.second - expected_val) / expected_val) > + relative_error_threshold) + local_result += 16; + } + }, + result); + if ((result & 1) != 0) + printf("atomic_%s failed with type %s\n", Op::name(), typeid(T).name()); + if ((result & 2) != 0) + printf("atomic_fetch_%s failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 4) != 0) + printf("atomic_%s_fetch failed with type %s\n", Op::name(), + typeid(T).name()); + if ((result & 8) != 0) + printf("atomic_fetch_%s did not return old value with type %s\n", + Op::name(), typeid(T).name()); + if ((result & 16) != 0) + printf("atomic_%s_fetch did not return updated value with type %s\n", + Op::name(), typeid(T).name()); + + return result == 0; +} + //--------------------------------------------------- //--------------atomic_test_control------------------ //--------------------------------------------------- @@ -395,6 +452,12 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { case 9: return atomic_op_test(old_val, update); case 10: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // FIXME_NVHPC: atomic-fetch-shift operation fails due to NVHPC OpenACC + // compiler bugs, which are reported to NVIDIA. + case 11: return true; + case 12: return true; +#else case 11: return update_in >= 0 ? atomic_op_test( old_val, update) @@ -403,6 +466,7 @@ bool AtomicOperationsTestIntegralType(int old_val_in, int update_in, int test) { return update_in >= 0 ? atomic_op_test( old_val, update) : true; +#endif case 13: return atomic_op_test(old_val, update); case 14: @@ -440,10 +504,20 @@ bool AtomicOperationsTestNonIntegralType(int old_val_in, int update_in, case 2: return atomic_op_test(old_val, update); case 3: return atomic_op_test(old_val, update); case 4: return atomic_op_test(old_val, update); +#if defined(KOKKOS_ENABLE_OPENACC) && defined(KOKKOS_COMPILER_NVHPC) + // NVHPC may use different internal precisions for the device and host + // atomic operations. Therefore, relative errors are used to compare the + // host results and device results. + case 5: + return update != 0 ? atomic_op_test_rel( + old_val, update) + : true; +#else case 5: return update != 0 ? atomic_op_test(old_val, update) : true; +#endif case 6: return atomic_op_test(old_val, update); } diff --git a/core/unit_test/TestAtomics.hpp b/core/unit_test/TestAtomics.hpp index 2b40f12d0a4..5f48e8c9746 100644 --- a/core/unit_test/TestAtomics.hpp +++ b/core/unit_test/TestAtomics.hpp @@ -498,7 +498,9 @@ TEST(TEST_CATEGORY, atomics) { ASSERT_TRUE((TestAtomic::Loop(100, 2))); ASSERT_TRUE((TestAtomic::Loop(100, 3))); -#ifndef KOKKOS_ENABLE_OPENMPTARGET + // FIXME_OPENMPTARGET + // FIXME_OPENACC: atomic operations on composite types are not supported. 
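The new atomic_op_test_rel above compares host and device results with a relative tolerance because NVHPC may evaluate the two sides at different internal precisions. The criterion it applies, factored into a standalone helper for clarity (names are illustrative, not from the patch):

    #include <cmath>

    template <class T>
    bool approx_equal(T actual, T expected, T rel_tol = T(1.0e-15)) {
      // Fall back to an absolute test when the expected value is zero,
      // since a relative error is undefined there.
      if (expected == T(0)) return std::fabs(actual) <= rel_tol;
      return std::fabs((actual - expected) / expected) <= rel_tol;
    }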
+#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(KOKKOS_ENABLE_OPENACC) ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 1))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 2))); ASSERT_TRUE((TestAtomic::Loop, TEST_EXECSPACE>(1, 3))); diff --git a/core/unit_test/TestBitManipulationBuiltins.hpp b/core/unit_test/TestBitManipulationBuiltins.hpp index 092e7cff618..2f3bcfe817d 100644 --- a/core/unit_test/TestBitManipulationBuiltins.hpp +++ b/core/unit_test/TestBitManipulationBuiltins.hpp @@ -804,26 +804,26 @@ struct TestBitCastFunction { using Kokkos::bit_cast; if (bit_cast(123) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #1\n"); + Kokkos::printf("failed check #1\n"); } if (bit_cast(123u) != 123) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #2\n"); + Kokkos::printf("failed check #2\n"); } if (bit_cast(~0u) != ~0) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #3\n"); + Kokkos::printf("failed check #3\n"); } if constexpr (sizeof(int) == sizeof(float)) { if (!check(12.34f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #4\n"); + Kokkos::printf("failed check #4\n"); } } if constexpr (sizeof(unsigned long long) == sizeof(double)) { if (!check(123.456)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #5\n"); + Kokkos::printf("failed check #5\n"); } } @@ -848,11 +848,11 @@ struct TestBitCastFunction { } if (!(bit_cast(arr) == arr)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #6\n"); + Kokkos::printf("failed check #6\n"); } if (!(bit_cast(arr2) == arr2)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed check #7\n"); + Kokkos::printf("failed check #7\n"); } } }; diff --git a/core/unit_test/TestComplex.hpp b/core/unit_test/TestComplex.hpp index bcae2e1d816..5501a35b7f0 100644 --- a/core/unit_test/TestComplex.hpp +++ b/core/unit_test/TestComplex.hpp @@ -451,17 +451,15 @@ TEST(TEST_CATEGORY, complex_issue_3867) { ASSERT_FLOAT_EQ(x.real(), y.real()); ASSERT_FLOAT_EQ(x.imag(), y.imag()); -#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); \ - static_assert( \ - std::is_same(), \ - std::declval()))>::value, \ - ""); +#define CHECK_POW_COMPLEX_PROMOTION(ARGTYPE1, ARGTYPE2, RETURNTYPE) \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); \ + static_assert( \ + std::is_same(), \ + std::declval()))>::value); CHECK_POW_COMPLEX_PROMOTION(Kokkos::complex, long double, Kokkos::complex); diff --git a/core/unit_test/TestConcepts.hpp b/core/unit_test/TestConcepts.hpp index 476a8848325..b85867bf63a 100644 --- a/core/unit_test/TestConcepts.hpp +++ b/core/unit_test/TestConcepts.hpp @@ -22,42 +22,42 @@ using ExecutionSpace = TEST_EXECSPACE; using MemorySpace = typename ExecutionSpace::memory_space; using DeviceType = typename ExecutionSpace::device_type; -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); -static_assert(!Kokkos::is_execution_space{}, ""); - -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); -static_assert(!Kokkos::is_memory_space{}, ""); - -static_assert(Kokkos::is_device{}, ""); -static_assert(Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(!Kokkos::is_device{}, ""); -static_assert(!Kokkos::is_device{}, ""); - -static_assert(Kokkos::is_space{}, ""); 
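The TestBitManipulationBuiltins hunk above swaps the internal KOKKOS_IMPL_DO_NOT_USE_PRINTF macro, whose SYCL definition was removed earlier in Kokkos_Setup_SYCL.hpp, for the public Kokkos::printf, which is callable from host code and device kernels alike. A minimal usage sketch:

    #include <Kokkos_Core.hpp>

    void print_from_kernel() {
      Kokkos::parallel_for(
          "print", 4,
          KOKKOS_LAMBDA(int i) { Kokkos::printf("iteration %d\n", i); });
    }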
-static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); -static_assert(!Kokkos::is_space{}, ""); - -static_assert(Kokkos::is_execution_space_v, ""); -static_assert(!Kokkos::is_execution_space_v, ""); +static_assert(Kokkos::is_execution_space{}); +static_assert(Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); +static_assert(!Kokkos::is_execution_space{}); + +static_assert(Kokkos::is_memory_space{}); +static_assert(Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); +static_assert(!Kokkos::is_memory_space{}); + +static_assert(Kokkos::is_device{}); +static_assert(Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(!Kokkos::is_device{}); +static_assert(!Kokkos::is_device{}); + +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); +static_assert(!Kokkos::is_space{}); + +static_assert(Kokkos::is_execution_space_v); +static_assert(!Kokkos::is_execution_space_v); static_assert( - std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); -static_assert(std::is_same>{}, ""); + std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); +static_assert(std::is_same>{}); /*------------------------------------------------- begin test for team_handle concept diff --git a/core/unit_test/TestDefaultDeviceTypeInit.hpp b/core/unit_test/TestDefaultDeviceTypeInit.hpp deleted file mode 100644 index 929c91db4e0..00000000000 --- a/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ /dev/null @@ -1,491 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include - -#include - -#ifdef KOKKOS_ENABLE_OPENMP -#include -#endif -#include -#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__) - -namespace Test { - -namespace Impl { - -std::set delete_these; -void cleanup_memory() { - for (auto x : delete_these) { - delete[] x; - } -} - -char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, - bool do_other, bool do_tune, int& nargs, - Kokkos::InitArguments& init_args) { - nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) + - (do_other ? 4 : 0) + (do_tune ? 1 : 0); - - char** args_kokkos = new char*[nargs]; - const int max_args_size = 45; - for (int i = 0; i < nargs; i++) { - args_kokkos[i] = new char[max_args_size]; - delete_these.insert(args_kokkos[i]); - } - - int threads_idx = do_other ? 1 : 0; - int numa_idx = (do_other ? 3 : 0) + (do_threads ? 1 : 0); - int device_idx = - (do_other ? 3 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0); - int tune_idx = (do_other ? 
4 : 0) + (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + - (do_device ? 1 : 0); - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - init_args.num_threads = nthreads; - snprintf(args_kokkos[threads_idx], max_args_size, "--threads=%i", nthreads); - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - init_args.num_numa = numa; - snprintf(args_kokkos[numa_idx], max_args_size, "--numa=%i", numa); - } - - if (do_device) { - init_args.device_id = 0; - snprintf(args_kokkos[device_idx], max_args_size, "--device-id=%i", 0); - } - - if (do_other) { - snprintf(args_kokkos[0], max_args_size, "--dummyarg=1"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], max_args_size, - "--dummy2arg"); - snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], max_args_size, - "dummy3arg"); - snprintf(args_kokkos[device_idx + (do_device ? 1 : 0)], max_args_size, - "dummy4arg=1"); - } - - if (do_tune) { - init_args.tune_internals = true; - snprintf(args_kokkos[tune_idx], max_args_size, "--kokkos-tune-internals"); - } - - return args_kokkos; -} - -Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, - bool do_device, bool do_tune) { - Kokkos::InitArguments args; - - if (do_threads) { - int nthreads = 3; - -#ifdef KOKKOS_ENABLE_OPENMP - if (omp_get_max_threads() < nthreads) { - nthreads = omp_get_max_threads(); - } -#elif defined(KOKKOS_ENABLE_HPX) - const int concurrency = std::thread::hardware_concurrency(); - if (concurrency < nthreads) { - nthreads = concurrency; - } -#endif - - if (Kokkos::hwloc::available()) { - if (Kokkos::hwloc::get_available_threads_per_core() < - static_cast(nthreads)) { - nthreads = Kokkos::hwloc::get_available_threads_per_core() * - Kokkos::hwloc::get_available_numa_count(); - } - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - nthreads = 1; - } -#endif - - args.num_threads = nthreads; - } - - if (do_numa) { - int numa = 1; - if (Kokkos::hwloc::available()) { - numa = Kokkos::hwloc::get_available_numa_count(); - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - numa = 1; - } -#endif - - args.num_numa = numa; - } - - if (do_device) { - args.device_id = 0; - } - - if (do_tune) { - args.tune_internals = true; - } - - return args; -} - -void check_correct_initialization(const Kokkos::InitArguments& argstruct) { - ASSERT_EQ(Kokkos::DefaultExecutionSpace::impl_is_initialized(), 1); - ASSERT_EQ(Kokkos::HostSpace::execution_space::impl_is_initialized(), 1); - - // Figure out the number of threads the HostSpace ExecutionSpace should have - // initialized to. 
- int expected_nthreads = argstruct.num_threads; - -#ifdef KOKKOS_ENABLE_OPENMP - if (std::is_same::value) { - // use openmp default num threads - if (expected_nthreads < 0 || - (expected_nthreads == 0 && !Kokkos::hwloc::available())) { - expected_nthreads = omp_get_max_threads(); - } - // use hwloc if available - else if (expected_nthreads == 0 && Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } - } -#endif - - if (expected_nthreads < 1) { - if (Kokkos::hwloc::available()) { - expected_nthreads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } else { - expected_nthreads = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) { - expected_nthreads = 1; - } -#endif - -#ifdef KOKKOS_ENABLE_HPX - // HPX uses all cores on machine by default. Skip this test. - if (std::is_same::value || - std::is_same::value) { - return; - } -#endif - } - - int expected_numa = argstruct.num_numa; - - if (expected_numa < 1) { - if (Kokkos::hwloc::available()) { - expected_numa = Kokkos::hwloc::get_available_numa_count(); - } else { - expected_numa = 1; - } - -#ifdef KOKKOS_ENABLE_SERIAL - if (std::is_same::value || - std::is_same::value) - expected_numa = 1; -#endif - } - - ASSERT_EQ(Kokkos::HostSpace::execution_space().impl_thread_pool_size(), - expected_nthreads); - -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - int device; - cudaGetDevice(&device); - - int expected_device = argstruct.device_id; - if (argstruct.device_id < 0) { - expected_device = Kokkos::Cuda().cuda_device(); - } - - ASSERT_EQ(expected_device, device); - } -#endif - ASSERT_EQ(argstruct.tune_internals, Kokkos::tune_internals()); -} - -// TODO: Add check whether correct number of threads are actually started. 
-void test_no_arguments() { - Kokkos::initialize(); - check_correct_initialization(Kokkos::InitArguments()); - Kokkos::finalize(); -} - -void test_commandline_args(int nargs, char** args, - const Kokkos::InitArguments& argstruct) { - Kokkos::initialize(nargs, args); - check_correct_initialization(argstruct); - Kokkos::finalize(); -} - -void test_initstruct_args(const Kokkos::InitArguments& args) { - Kokkos::initialize(args); - check_correct_initialization(args); - Kokkos::finalize(); -} - -} // namespace Impl - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -TEST(defaultdevicetypeinit, no_args) { Impl::test_no_arguments(); } -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -TEST(defaultdevicetypeinit, commandline_args_empty) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -TEST(defaultdevicetypeinit, commandline_args_other) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, false, true, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -TEST(defaultdevicetypeinit, commandline_args_nthreads) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(true, false, false, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, false, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -TEST(defaultdevicetypeinit, commandline_args_nthreads_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(true, false, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -TEST(defaultdevicetypeinit, commandline_args_numa_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = - Impl::init_kokkos_args(false, true, true, false, false, nargs, argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} -#endif - -#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -TEST(defaultdevicetypeinit, commandline_args_device) { - Kokkos::InitArguments argstruct; - int nargs = 0; - char** args = Impl::init_kokkos_args(false, false, true, false, false, nargs, - argstruct); - Impl::test_commandline_args(nargs, args, argstruct); - - Impl::cleanup_memory(); - delete[] args; -} 
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
-TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) {
-  Kokkos::InitArguments argstruct;
-  int nargs = 0;
-  char** args =
-      Impl::init_kokkos_args(true, true, true, true, false, nargs, argstruct);
-  Impl::test_commandline_args(nargs, args, argstruct);
-  Impl::cleanup_memory();
-  delete[] args;
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
-TEST(defaultdevicetypeinit, commandline_args_nthreads_numa_device_other_tune) {
-  Kokkos::InitArguments argstruct;
-  int nargs = 0;
-  char** args =
-      Impl::init_kokkos_args(true, true, true, true, true, nargs, argstruct);
-  Impl::test_commandline_args(nargs, args, argstruct);
-  Impl::cleanup_memory();
-  delete[] args;
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
-TEST(defaultdevicetypeinit, initstruct_default) {
-  Kokkos::InitArguments args;
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
-TEST(defaultdevicetypeinit, initstruct_nthreads) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, false, false, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
-TEST(defaultdevicetypeinit, initstruct_nthreads_numa) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, true, false, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
-TEST(defaultdevicetypeinit, initstruct_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(false, false, true, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
-TEST(defaultdevicetypeinit, initstruct_nthreads_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, false, true, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17
-TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, false);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-#ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18
-TEST(defaultdevicetypeinit, initstruct_nthreads_numa_device_tune) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true, true, true, true);
-  Impl::test_initstruct_args(args);
-}
-#endif
-
-}  // namespace Test
-
-#endif
diff --git a/core/unit_test/TestDeviceAndThreads.py b/core/unit_test/TestDeviceAndThreads.py
index 1d3ff8eea7e..63d26ad41a4 100644
--- a/core/unit_test/TestDeviceAndThreads.py
+++ b/core/unit_test/TestDeviceAndThreads.py
@@ -17,6 +17,8 @@
 
 import unittest
 import subprocess
+import platform
+import os
 
 PREFIX = "$"
 EXECUTABLE = "$"
@@ -30,7 +32,22 @@ def GetFlag(flag, *extra_args):
     return int(p.stdout)
 
 def GetNumThreads(max_threads):
-    for x in [1, 2, 3, 5, 7]:
+    args = []
+    name = platform.system()
+    if name == 'Darwin':
+        args = ['sysctl', '-n', 'hw.physicalcpu_max']
+    elif name == 'Linux':
+        args = ['nproc', '--all']
+    else:
+        args = ['wmic', 'cpu', 'get', 'NumberOfCores']
+
+    result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output = result.stdout.decode('utf-8')
+    phys_cores_count = int(output)
+    looplist = [1] + [i*phys_cores_count for i in [1,2,3,4,5,6,7]] \
+        if GetFlag("hwloc_enabled") else [1,2,3,4,5]
+
+    for x in looplist:
         if x >= max_threads:
             break
         yield x
@@ -48,13 +65,25 @@ def test_num_threads(self):
                 "num_threads",
                 "--kokkos-num-threads={}".format(num_threads)))
 
+    def test_num_devices(self):
+        if "KOKKOS_VISIBLE_DEVICES" in os.environ:
+            self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set")
+        num_devices = GetFlag("num_devices")
+        self.assertNotEqual(num_devices, 0)
+        if num_devices == -1:
+            self.skipTest("no device backend enabled")
+        self.assertGreaterEqual(num_devices, 1)
+
     def test_device_id(self):
-        device_count = GetFlag("device_count")
-        if device_count == 0:
-            self.skipTest("no device detected")
+        if "KOKKOS_VISIBLE_DEVICES" in os.environ:
+            self.skipTest("KOKKOS_VISIBLE_DEVICES environment variable is set")
+        num_devices = GetFlag("num_devices")
+        if num_devices == -1:
+            self.assertEqual(-1, GetFlag("device_id"))
+            self.skipTest("no device backend enabled")
         # by default use the first GPU available for execution
         self.assertEqual(0, GetFlag("device_id"))
-        for device_id in range(device_count):
+        for device_id in range(num_devices):
             self.assertEqual(
                 device_id,
                 GetFlag(
diff --git a/core/unit_test/TestExecutionSpace.hpp b/core/unit_test/TestExecutionSpace.hpp
index 6f0f159c174..983a5975afd 100644
--- a/core/unit_test/TestExecutionSpace.hpp
+++ b/core/unit_test/TestExecutionSpace.hpp
@@ -25,13 +25,7 @@ struct CheckClassWithExecutionSpaceAsDataMemberIsCopyable {
   Kokkos::DefaultExecutionSpace device;
   Kokkos::DefaultHostExecutionSpace host;
 
-  KOKKOS_FUNCTION void operator()(int, int& e) const {
-    // not actually doing anything useful, mostly checking that
-    // ExecutionSpace::in_parallel() is callable
-    if (static_cast(device.in_parallel()) < 0) {
-      ++e;
-    }
-  }
+  KOKKOS_FUNCTION void operator()(int i, int& e) const { e += i; }
 
   CheckClassWithExecutionSpaceAsDataMemberIsCopyable() {
     int errors;
diff --git a/core/unit_test/TestFunctorAnalysis.hpp b/core/unit_test/TestFunctorAnalysis.hpp
index c024526111b..e58324144e4 100644
--- a/core/unit_test/TestFunctorAnalysis.hpp
+++ b/core/unit_test/TestFunctorAnalysis.hpp
@@ -59,16 +59,15 @@ void test_functor_analysis() {
 
   using R01 = typename A01::Reducer;
 
-  static_assert(std::is_void::value, "");
-  static_assert(std::is_void::value, "");
-  static_assert(std::is_void::value, "");
-  static_assert(std::is_same::value,
-                "");
-
-  static_assert(!A01::has_join_member_function, "");
-  static_assert(!A01::has_init_member_function, "");
-  static_assert(!A01::has_final_member_function, "");
-  static_assert(A01::StaticValueSize == 0, "");
+  static_assert(std::is_void::value);
+  static_assert(std::is_void::value);
+  static_assert(std::is_void::value);
+  static_assert(std::is_same::value);
+
+  static_assert(!A01::has_join_member_function);
+  static_assert(!A01::has_init_member_function);
+  static_assert(!A01::has_final_member_function);
+  static_assert(A01::StaticValueSize == 0);
 
   ASSERT_EQ(R01(c01).length(), 0);
 
   //------------------------------
@@ -78,16 +77,15 @@ void test_functor_analysis() {
       Kokkos::RangePolicy, decltype(c02), void>;
   using R02 = typename A02::Reducer;
 
-  static_assert(std::is_same::value, "");
-  static_assert(std::is_same::value, "");
-  static_assert(std::is_same::value, "");
-  static_assert(std::is_same::value,
-                "");
+  static_assert(std::is_same::value);
+  static_assert(std::is_same::value);
+  static_assert(std::is_same::value);
+  static_assert(std::is_same::value);
 
-  static_assert(!A02::has_join_member_function, "");
-  static_assert(!A02::has_init_member_function, "");
-  static_assert(!A02::has_final_member_function, "");
-  static_assert(A02::StaticValueSize == sizeof(double), "");
+  static_assert(!A02::has_join_member_function);
+  static_assert(!A02::has_init_member_function);
+  static_assert(!A02::has_final_member_function);
+
static_assert(A02::StaticValueSize == sizeof(double)); ASSERT_EQ(R02(c02).length(), 1); //------------------------------ @@ -99,23 +97,19 @@ void test_functor_analysis() { using R03 = typename A03::Reducer; static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type*>::value); static_assert(std::is_same::value, - ""); + TestFunctorAnalysis_03::value_type&>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); - static_assert(A03::has_join_member_function, ""); - static_assert(A03::has_init_member_function, ""); - static_assert(!A03::has_final_member_function, ""); - static_assert( - A03::StaticValueSize == sizeof(TestFunctorAnalysis_03::value_type), ""); + static_assert(A03::has_join_member_function); + static_assert(A03::has_init_member_function); + static_assert(!A03::has_final_member_function); + static_assert(A03::StaticValueSize == + sizeof(TestFunctorAnalysis_03::value_type)); ASSERT_EQ(R03(c03).length(), 1); //------------------------------ diff --git a/core/unit_test/TestHalfOperators.hpp b/core/unit_test/TestHalfOperators.hpp index 752e3b50816..c69cdd57034 100644 --- a/core/unit_test/TestHalfOperators.hpp +++ b/core/unit_test/TestHalfOperators.hpp @@ -268,96 +268,6 @@ enum OP_TESTS { N_OP_TESTS }; -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) -template -struct Functor_TestHalfVolatileOperators { - volatile half_type h_lhs, h_rhs; - view_type actual_lhs, expected_lhs; - double d_lhs, d_rhs; - Functor_TestHalfVolatileOperators(volatile half_type lhs = half_type(0), - volatile half_type rhs = half_type(0)) - : h_lhs(lhs), h_rhs(rhs) { - actual_lhs = view_type("actual_lhs", N_OP_TESTS); - expected_lhs = view_type("expected_lhs", N_OP_TESTS); - half_type nv_tmp; - nv_tmp = h_lhs; - d_lhs = static_cast(nv_tmp); - nv_tmp = h_rhs; - d_rhs = static_cast(nv_tmp); - if (std::is_same::value) { - auto run_on_host = *this; - run_on_host(0); - } else { - Kokkos::parallel_for("Test::Functor_TestHalfVolatileOperators", - Kokkos::RangePolicy(0, 1), *this); - } - } - - KOKKOS_FUNCTION - void operator()(int) const { - volatile half_type tmp_lhs; - half_type nv_tmp; - - // Initialze output views to catch missing test invocations - for (int i = 0; i < N_OP_TESTS; ++i) { - actual_lhs(i) = 1; - expected_lhs(i) = -1; - } - - nv_tmp = h_lhs; - actual_lhs(ASSIGN) = static_cast(nv_tmp); - expected_lhs(ASSIGN) = d_lhs; - - actual_lhs(LT_H_H) = h_lhs < h_rhs; - expected_lhs(LT_H_H) = d_lhs < d_rhs; - - actual_lhs(LE_H_H) = h_lhs <= h_rhs; - expected_lhs(LE_H_H) = d_lhs <= d_rhs; - - actual_lhs(NEQ) = h_lhs != h_rhs; - expected_lhs(NEQ) = d_lhs != d_rhs; - - actual_lhs(GT_H_H) = h_lhs > h_rhs; - expected_lhs(GT_H_H) = d_lhs > d_rhs; - - actual_lhs(GE_H_H) = h_lhs >= h_rhs; - expected_lhs(GE_H_H) = d_lhs >= d_rhs; - - actual_lhs(EQ) = h_lhs == h_rhs; - expected_lhs(EQ) = d_lhs == d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs += h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CADD_H_H) = static_cast(nv_tmp); - expected_lhs(CADD_H_H) = d_lhs; - expected_lhs(CADD_H_H) += d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs -= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CSUB_H_H) = static_cast(nv_tmp); - expected_lhs(CSUB_H_H) = d_lhs; - expected_lhs(CSUB_H_H) -= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs *= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CMUL_H_H) = static_cast(nv_tmp); - expected_lhs(CMUL_H_H) = d_lhs; - 
expected_lhs(CMUL_H_H) *= d_rhs; - - tmp_lhs = h_lhs; - tmp_lhs /= h_rhs; - nv_tmp = tmp_lhs; - actual_lhs(CDIV_H_H) = static_cast(nv_tmp); - expected_lhs(CDIV_H_H) = d_lhs; - expected_lhs(CDIV_H_H) /= d_rhs; - } -}; -#endif - template struct Functor_TestHalfOperators { half_type h_lhs, h_rhs; @@ -995,33 +905,6 @@ void __test_half_operators(half_type h_lhs, half_type h_rhs) { static_cast(epsilon)); } -// volatile-qualified parameter type 'volatile half_type' is deprecated -#if !defined(KOKKOS_ENABLE_CXX20) && !defined(KOKKOS_ENABLE_CXX23) - // Test partial volatile support - volatile half_type _h_lhs = h_lhs; - volatile half_type _h_rhs = h_rhs; - Functor_TestHalfVolatileOperators f_volatile_device( - _h_lhs, _h_rhs); - Functor_TestHalfVolatileOperators f_volatile_host( - _h_lhs, _h_rhs); - - ExecutionSpace().fence(); - Kokkos::deep_copy(f_device_actual_lhs, f_device.actual_lhs); - Kokkos::deep_copy(f_device_expected_lhs, f_device.expected_lhs); - for (int op_test = 0; op_test < N_OP_TESTS; op_test++) { - // printf("op_test = %d\n", op_test); - if (op_test == ASSIGN || op_test == LT_H_H || op_test == LE_H_H || - op_test == NEQ || op_test == EQ || op_test == GT_H_H || - op_test == GE_H_H || op_test == CADD_H_H || op_test == CSUB_H_H || - op_test == CMUL_H_H || op_test == CDIV_H_H) { - ASSERT_NEAR(f_device_actual_lhs(op_test), f_device_expected_lhs(op_test), - static_cast(epsilon)); - ASSERT_NEAR(f_host.actual_lhs(op_test), f_host.expected_lhs(op_test), - static_cast(epsilon)); - } - } -#endif - // is_trivially_copyable is false with the addition of explicit // copy constructors that are required for supporting reductions // ASSERT_TRUE(std::is_trivially_copyable::value); diff --git a/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp b/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp index 3ee2ff52051..467b9ad157f 100644 --- a/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp +++ b/core/unit_test/TestHostSharedPtrAccessOnDevice.hpp @@ -37,7 +37,7 @@ template struct CheckAccessStoredPointerAndDereferenceOnDevice { SmartPtr m_device_ptr; using ElementType = typename SmartPtr::element_type; - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); CheckAccessStoredPointerAndDereferenceOnDevice(SmartPtr device_ptr) : m_device_ptr(device_ptr) { diff --git a/core/unit_test/TestInitializationSettings.cpp b/core/unit_test/TestInitializationSettings.cpp index f5be0e47aab..40dc3f11df3 100644 --- a/core/unit_test/TestInitializationSettings.cpp +++ b/core/unit_test/TestInitializationSettings.cpp @@ -20,30 +20,6 @@ namespace { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -void take_initialization_settings(Kokkos::InitializationSettings const&) {} - -TEST(defaultdevicetype, - init_arguments_implicit_conversion_to_initialization_settings) { - Kokkos::InitArguments arguments; - take_initialization_settings(arguments); // check that conversion is implicit - arguments.device_id = 1; - arguments.tune_internals = true; - Kokkos::InitializationSettings settings{arguments}; - EXPECT_FALSE(settings.has_num_threads()); - EXPECT_TRUE(settings.has_device_id()); - EXPECT_EQ(settings.get_device_id(), 1); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); - EXPECT_FALSE(settings.has_disable_warnings()); - EXPECT_TRUE(settings.has_tune_internals()); - EXPECT_TRUE(settings.get_tune_internals()); - EXPECT_FALSE(settings.has_tools_help()); - EXPECT_FALSE(settings.has_tools_libs()); - EXPECT_FALSE(settings.has_tools_args()); -} -#endif - TEST(defaultdevicetype, 
initialization_settings) { auto const settings = Kokkos::InitializationSettings() .set_num_threads(255) @@ -52,8 +28,6 @@ TEST(defaultdevicetype, initialization_settings) { EXPECT_TRUE(settings.has_num_threads()); EXPECT_EQ(settings.get_num_threads(), 255); EXPECT_FALSE(settings.has_device_id()); - EXPECT_FALSE(settings.has_num_devices()); - EXPECT_FALSE(settings.has_skip_device()); EXPECT_TRUE(settings.has_disable_warnings()); EXPECT_FALSE(settings.get_disable_warnings()); EXPECT_FALSE(settings.has_tune_internals()); @@ -75,8 +49,6 @@ constexpr bool test_initialization_settings_getter() { TYPE>::value); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_threads, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(device_id, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(num_devices, int); - CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(skip_device, int); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(disable_warnings, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tune_internals, bool); CHECK_INITIALIZATION_SETTINGS_GETTER_RETURN_TYPE(tools_help, bool); diff --git a/core/unit_test/TestJoinBackwardCompatibility.hpp b/core/unit_test/TestJoinBackwardCompatibility.hpp index 24cf52aa709..efe4a2307a8 100644 --- a/core/unit_test/TestJoinBackwardCompatibility.hpp +++ b/core/unit_test/TestJoinBackwardCompatibility.hpp @@ -36,9 +36,8 @@ KOKKOS_FUNCTION constexpr MyErrorCode operator|(MyErrorCode lhs, } static_assert((no_error | error_operator_plus_equal_volatile) == - error_operator_plus_equal_volatile, - ""); -static_assert((error_join_volatile | error_operator_plus_equal) == 0b101, ""); + error_operator_plus_equal_volatile); +static_assert((error_join_volatile | error_operator_plus_equal) == 0b101); struct MyJoinBackCompatValueType { MyErrorCode err = no_error; diff --git a/core/unit_test/TestMDRangePolicyCTAD.cpp b/core/unit_test/TestMDRangePolicyCTAD.cpp new file mode 100644 index 00000000000..b2c3d021c35 --- /dev/null +++ b/core/unit_test/TestMDRangePolicyCTAD.cpp @@ -0,0 +1,138 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +struct TestMDRangePolicyCTAD { + template + static void maybe_unused(Ts&&...) 
{} + + struct SomeExecutionSpace { + using execution_space = SomeExecutionSpace; + using size_type = size_t; + }; + static_assert(Kokkos::is_execution_space_v); + + struct ImplicitlyConvertibleToDefaultExecutionSpace { + [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const { + return Kokkos::DefaultExecutionSpace(); + } + }; + static_assert(!Kokkos::is_execution_space_v< + ImplicitlyConvertibleToDefaultExecutionSpace>); + + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace des; + [[maybe_unused]] static inline ImplicitlyConvertibleToDefaultExecutionSpace + notEs; + [[maybe_unused]] static inline SomeExecutionSpace ses; + + [[maybe_unused]] static inline int t[5]; + [[maybe_unused]] static inline int64_t tt[5]; + [[maybe_unused]] static inline Kokkos::Array a; + [[maybe_unused]] static inline Kokkos::Array aa; + [[maybe_unused]] static inline int64_t i64; + + // Workaround for nvc++ (CUDA-11.7-NVHPC) ignoring [[maybe_unused]] on + // ImplicitlyConvertibleToDefaultExecutionSpace::operator + // Kokkos::DefaultExecutionSpace() const + [[maybe_unused]] static inline Kokkos::DefaultExecutionSpace notEsToDes = + notEs; + + // Workaround for HIP-ROCm-5.2 "declared but never referenced" + TestMDRangePolicyCTAD() { + maybe_unused(des, notEs, ses, t, tt, a, aa, notEsToDes, i64); + } + + // MDRangePolicy with C array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(t, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, t, tt))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, t, t))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, t, t))>); + + // MDRangePolicy with Kokkos::initializer_list parameters + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}))>); + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + {1, 2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert(std::is_same_v>, + decltype(Kokkos::MDRangePolicy( + des, {1, 2, 3, 4, 5, 6}, + {i64, i64, i64, i64, i64, i64}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(ses, {1, 2, 3, 4, 5, 6}, + {1, 2, 3, 4, 5, 6}))>); + + // MDRangePolicy with Kokkos::Array parameters + + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(des, a, a, aa))>); + static_assert( + std::is_same_v>, + decltype(Kokkos::MDRangePolicy(notEs, a, a, aa))>); + + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a))>); + static_assert( + std::is_same_v< + Kokkos::MDRangePolicy>, + decltype(Kokkos::MDRangePolicy(ses, a, a, aa))>); +}; + +} // namespace diff --git a/core/unit_test/TestMDRangePolicyConstructors.hpp b/core/unit_test/TestMDRangePolicyConstructors.hpp index f577f415e7c..6f241b45d47 100644 --- a/core/unit_test/TestMDRangePolicyConstructors.hpp +++ 
b/core/unit_test/TestMDRangePolicyConstructors.hpp @@ -18,6 +18,8 @@ #include +#include + namespace { template @@ -86,12 +88,56 @@ TEST(TEST_CATEGORY_DEATH, policy_bounds_unsafe_narrowing_conversions) { using Policy = Kokkos::MDRangePolicy, Kokkos::IndexType>; + std::string msg = + "Kokkos::MDRangePolicy bound type error: an unsafe implicit conversion " + "is " + "performed on a bound (-1) in dimension (0), which may not preserve its " + "original value.\n"; + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { - (void)Policy({-1, 0}, {2, 3}); - }, - "unsafe narrowing conversion"); + ASSERT_DEATH({ (void)Policy({-1, 0}, {2, 3}); }, expected); +} + +TEST(TEST_CATEGORY_DEATH, policy_invalid_bounds) { + using Policy = Kokkos::MDRangePolicy>; + + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + auto [dim0, dim1] = (Policy::inner_direction == Kokkos::Iterate::Right) + ? std::make_pair(1, 0) + : std::make_pair(0, 1); + std::string msg1 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim0) + ".\n"; + + std::string msg2 = + "Kokkos::MDRangePolicy bounds error: The lower bound (100) is greater " + "than its upper bound (90) in dimension " + + std::to_string(dim1) + ".\n"; + +#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + // escape the parentheses in the regex to match the error message + msg1 = std::regex_replace(msg1, std::regex("\\(|\\)"), "\\$&"); + (void)msg2; + ASSERT_DEATH({ (void)Policy({100, 100}, {90, 90}); }, msg1); +#else + if (!Kokkos::show_warnings()) { + GTEST_SKIP() << "Kokkos warning messages are disabled"; + } + + ::testing::internal::CaptureStderr(); + (void)Policy({100, 100}, {90, 90}); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg1 + msg2); +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg1; + (void)msg2; +#endif + +#endif } #endif diff --git a/core/unit_test/TestMathematicalFunctions.hpp b/core/unit_test/TestMathematicalFunctions.hpp index 424ba05a904..ad035d4e4bf 100644 --- a/core/unit_test/TestMathematicalFunctions.hpp +++ b/core/unit_test/TestMathematicalFunctions.hpp @@ -287,21 +287,20 @@ struct FloatingPointComparison { public: template - KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, double ulp) const { + KOKKOS_FUNCTION bool compare_near_zero(FPT const& fpv, int ulp) const { auto abs_tol = eps(fpv) * ulp; bool ar = absolute(fpv) < abs_tol; if (!ar) { Kokkos::printf("absolute value exceeds tolerance [|%e| > %e]\n", - (double)fpv, abs_tol); + (double)fpv, (double)abs_tol); } return ar; } template - KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, - double ulp) const { + KOKKOS_FUNCTION bool compare(Lhs const& lhs, Rhs const& rhs, int ulp) const { if (lhs == 0) { return compare_near_zero(rhs, ulp); } else if (rhs == 0) { @@ -315,7 +314,7 @@ struct FloatingPointComparison { bool ar = abs_diff == 0 || rel_diff < rel_tol; if (!ar) { Kokkos::printf("relative difference exceeds tolerance [%e > %e]\n", - (double)rel_diff, rel_tol); + (double)rel_diff, (double)rel_tol); } return ar; @@ -348,7 +347,7 @@ struct math_function_name; } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ 
template <> \ @@ -373,7 +372,7 @@ struct math_function_name; math_unary_function_return_type_t>::value); \ return REF_FUNC; \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathUnaryFunction_##FUNC; \ template <> \ @@ -477,7 +476,7 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); } \ MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk_##FUNC = MathBinaryFunction_##FUNC; \ template <> \ @@ -511,7 +510,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); math_ternary_function_return_type_t>::value); \ return std::FUNC(x, y, z); \ } \ - static KOKKOS_FUNCTION double ulp_factor() { return ULP_FACTOR; } \ + static KOKKOS_FUNCTION int ulp_factor() { return ULP_FACTOR; } \ }; \ using kk3_##FUNC = MathTernaryFunction_##FUNC; \ template <> \ @@ -1307,12 +1306,12 @@ struct TestAbsoluteValueFunction { if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::half_t)\n"); + Kokkos::printf("failed abs(KE::half_t)\n"); } if (abs(static_cast(4.f)) != static_cast(4.f) || abs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed abs(KE::bhalf_t)\n"); + Kokkos::printf("failed abs(KE::bhalf_t)\n"); } if (abs(5.) != 5. || abs(-5.) != 5.) { ++e; @@ -1332,19 +1331,17 @@ struct TestAbsoluteValueFunction { Kokkos::printf("failed abs(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); static_assert(std::is_same(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; @@ -1365,26 +1362,26 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::fabs; if (fabs(4.f) != 4.f || fabs(-4.f) != 4.f) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(float)\n"); + Kokkos::printf("failed fabs(float)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::half_t)\n"); + Kokkos::printf("failed fabs(KE::half_t)\n"); } if (fabs(static_cast(4.f)) != static_cast(4.f) || fabs(static_cast(-4.f)) != static_cast(4.f)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(KE::bhalf_t)\n"); + Kokkos::printf("failed fabs(KE::bhalf_t)\n"); } if (fabs(5.) != 5. || fabs(-5.) != 5.) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(double)\n"); + Kokkos::printf("failed fabs(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (fabs(6.l) != 6.l || fabs(-6.l) != 6.l) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fabs(long double)\n"); + Kokkos::printf("failed fabs(long double)\n"); } #endif // special values @@ -1392,8 +1389,7 @@ struct TestFloatingPointAbsoluteValueFunction { using Kokkos::isnan; if (fabs(-0.) != 0. 
|| !isinf(fabs(-INFINITY)) || !isnan(fabs(-NAN))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fabs(floating_point) special values\n"); + Kokkos::printf("failed fabs(floating_point) special values\n"); } static_assert(std::is_same(4.f))), @@ -1425,7 +1421,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(fmod(6.2f, 4.f), 2.2f, 1) && !compare(fmod(-6.2f, 4.f), -2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(float)\n"); + Kokkos::printf("failed fmod(float)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1434,7 +1430,7 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { fmod(static_cast(-6.2f), static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::half_t)\n"); + Kokkos::printf("failed fmod(KE::half_t)\n"); } if (!compare( fmod(static_cast(6.2f), static_cast(4.f)), @@ -1443,17 +1439,17 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(KE::bhalf_t)\n"); + Kokkos::printf("failed fmod(KE::bhalf_t)\n"); } if (!compare(fmod(6.2, 4.), 2.2, 1) && !compare(fmod(-6.2, 4.), -2.2, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(double)\n"); + Kokkos::printf("failed fmod(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(fmod(6.2l, 4.l), 2.2l, 1) && !compare(fmod(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed fmod(long double)\n"); + Kokkos::printf("failed fmod(long double)\n"); } #endif // special values @@ -1462,23 +1458,19 @@ struct TestFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(fmod(-KE::infinity::value, 1.f)) && !isnan(fmod(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "failed fmod(floating_point) special values\n"); + Kokkos::printf("failed fmod(floating_point) special values\n"); } static_assert(std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert(std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, - ""); + static_assert(std::is_same::value); #endif } }; @@ -1502,7 +1494,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!compare(remainder(6.2f, 4.f), 2.2f, 2) && !compare(remainder(-6.2f, 4.f), 2.2f, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(float)\n"); + Kokkos::printf("failed remainder(float)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1511,7 +1503,7 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::half_t)\n"); + Kokkos::printf("failed remainder(KE::half_t)\n"); } if (!compare(remainder(static_cast(6.2f), static_cast(4.f)), @@ -1520,18 +1512,18 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { static_cast(4.f)), -static_cast(2.2f), 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(KE::bhalf_t)\n"); + Kokkos::printf("failed remainder(KE::bhalf_t)\n"); } if (!compare(remainder(6.2, 4.), 2.2, 2) && !compare(remainder(-6.2, 4.), 2.2, 1)) { 
++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(double)\n"); + Kokkos::printf("failed remainder(double)\n"); } #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (!compare(remainder(6.2l, 4.l), 2.2l, 1) && !compare(remainder(-6.2l, 4.l), -2.2l, 1)) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed remainder(long double)\n"); + Kokkos::printf("failed remainder(long double)\n"); } #endif // special values @@ -1540,26 +1532,23 @@ struct TestIEEEFloatingPointRemainderFunction : FloatingPointComparison { if (!isinf(remainder(-KE::infinity::value, 1.f)) && !isnan(remainder(-KE::quiet_NaN::value, 1.f))) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF( + Kokkos::printf( "failed remainder(floating_point) special values\n"); } static_assert( std::is_same(4.f), static_cast(4.f))), - KE::half_t>::value, - ""); + KE::half_t>::value); static_assert( std::is_same(4.f), static_cast(4.f))), - KE::bhalf_t>::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, ""); + KE::bhalf_t>::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS static_assert( - std::is_same::value, ""); + std::is_same::value); #endif } }; @@ -1765,7 +1754,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::half_t)\n"); + Kokkos::printf("failed isnan(KE::half_t)\n"); } if (isnan(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1775,7 +1764,7 @@ struct TestIsNaN { #endif ) { ++e; - KOKKOS_IMPL_DO_NOT_USE_PRINTF("failed isnan(KE::bhalf_t)\n"); + Kokkos::printf("failed isnan(KE::bhalf_t)\n"); } if (isnan(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 @@ -1801,11 +1790,11 @@ struct TestIsNaN { Kokkos::printf("failed isnan(floating_point) special values\n"); } - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); + static_assert(std::is_same::value); + static_assert(std::is_same::value); #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS - static_assert(std::is_same::value, ""); + static_assert(std::is_same::value); #endif } }; diff --git a/core/unit_test/TestMathematicalSpecialFunctions.hpp b/core/unit_test/TestMathematicalSpecialFunctions.hpp index 06c84c75137..7969dc86864 100644 --- a/core/unit_test/TestMathematicalSpecialFunctions.hpp +++ b/core/unit_test/TestMathematicalSpecialFunctions.hpp @@ -1213,13 +1213,13 @@ struct TestComplexBesselI0K0Function { } EXPECT_EQ(h_ref_cbk0(0), h_cbk0(0)); - int upper_limit = N; + int upper_limit_0 = N; // FIXME_SYCL Failing for Intel GPUs, 19 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 19; + upper_limit_0 = 19; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_0; i++) { EXPECT_LE(Kokkos::abs(h_cbk0(i) - h_ref_cbk0(i)), Kokkos::abs(h_ref_cbk0(i)) * 1e-13) << "at index " << i; @@ -1462,13 +1462,13 @@ struct TestComplexBesselI1K1Function { } EXPECT_EQ(h_ref_cbk1(0), h_cbk1(0)); - int upper_limit = N; + int upper_limit_1 = N; // FIXME_SYCL Failing for Intel GPUs, 8 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 8; + upper_limit_1 = 8; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_1; i++) { EXPECT_LE(Kokkos::abs(h_cbk1(i) - h_ref_cbk1(i)), 
Kokkos::abs(h_ref_cbk1(i)) * 1e-13) << "at index " << i; @@ -1718,20 +1718,26 @@ struct TestComplexBesselH1Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch10(0), h_ch10(0)); - for (int i = 1; i < N; i++) { + int upper_limit_10 = N; +// FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_10 = 17; +#endif + for (int i = 1; i < upper_limit_10; i++) { EXPECT_LE(Kokkos::abs(h_ch10(i) - h_ref_ch10(i)), Kokkos::abs(h_ref_ch10(i)) * 1e-13) << "at index " << i; } EXPECT_EQ(h_ref_ch11(0), h_ch11(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case + int upper_limit_11 = N; + // FIXME_SYCL Failing for Intel GPUs, 2 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 16; + upper_limit_11 = 2; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_11; i++) { EXPECT_LE(Kokkos::abs(h_ch11(i) - h_ref_ch11(i)), Kokkos::abs(h_ref_ch11(i)) * 1e-13) << "at index " << i; @@ -1912,19 +1918,26 @@ struct TestComplexBesselH2Function { ((HIP_VERSION_MAJOR == 5) && \ !((HIP_VERSION_MINOR == 5) || (HIP_VERSION_MINOR == 6))) EXPECT_EQ(h_ref_ch20(0), h_ch20(0)); - for (int i = 1; i < N; i++) { + int upper_limit_20 = N; +// FIXME_SYCL Failing for Intel GPUs, 16 is the first failing test case +#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + upper_limit_20 = 16; +#endif + for (int i = 1; i < upper_limit_20; i++) { EXPECT_LE(Kokkos::abs(h_ch20(i) - h_ref_ch20(i)), - Kokkos::abs(h_ref_ch20(i)) * 1e-13); + Kokkos::abs(h_ref_ch20(i)) * 1e-13) + << "at index " << i; } EXPECT_EQ(h_ref_ch21(0), h_ch21(0)); - int upper_limit = N; - // FIXME_SYCL Failing for Intel GPUs, 17 is the first failing test case + int upper_limit_21 = N; + // FIXME_SYCL Failing for Intel GPUs, 1 is the first failing test case #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) if (std::is_same_v) - upper_limit = 17; + upper_limit_21 = 1; #endif - for (int i = 1; i < upper_limit; i++) { + for (int i = 1; i < upper_limit_21; i++) { EXPECT_LE(Kokkos::abs(h_ch21(i) - h_ref_ch21(i)), Kokkos::abs(h_ref_ch21(i)) * 1e-13) << "at index " << i; @@ -1954,31 +1967,61 @@ TEST(TEST_CATEGORY, mathspecialfunc_errorfunc) { #endif TEST(TEST_CATEGORY, mathspecialfunc_cbesselj0y0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ0Y0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselj1y1) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselJ1Y1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli0k0) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI0K0Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesseli1k1) { +#if 
defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselI1K1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh1stkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH1Function test; test.testit(); } TEST(TEST_CATEGORY, mathspecialfunc_cbesselh2ndkind) { +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ARCH_INTEL_GPU) + if (std::is_same_v) + GTEST_SKIP() << "skipping since test is known to fail with OpenMPTarget on " + "Intel GPUs"; // FIXME_OPENMPTARGET +#endif TestComplexBesselH2Function test; test.testit(); } diff --git a/core/unit_test/TestNonTrivialScalarTypes.hpp b/core/unit_test/TestNonTrivialScalarTypes.hpp index eaf7a4125cc..116ac58c39f 100644 --- a/core/unit_test/TestNonTrivialScalarTypes.hpp +++ b/core/unit_test/TestNonTrivialScalarTypes.hpp @@ -214,7 +214,7 @@ struct point_t { uint8_t x, y, z; KOKKOS_FUNCTION - point_t() : x(1), y(1), z(1){}; + point_t() : x(0), y(0), z(0){}; KOKKOS_FUNCTION point_t(const point_t &val) : x(val.x), y(val.y), z(val.z){}; diff --git a/core/unit_test/TestNumericTraits.hpp b/core/unit_test/TestNumericTraits.hpp index ec1c1e0ca0b..81a9d0a5e0d 100644 --- a/core/unit_test/TestNumericTraits.hpp +++ b/core/unit_test/TestNumericTraits.hpp @@ -210,9 +210,10 @@ TEST(TEST_CATEGORY, numeric_traits_infinity) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -224,9 +225,9 @@ TEST(TEST_CATEGORY, numeric_traits_epsilon) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -239,9 +240,9 @@ TEST(TEST_CATEGORY, numeric_traits_round_error) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -253,9 +254,9 @@ TEST(TEST_CATEGORY, numeric_traits_norm_min) { #endif TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -263,9 +264,9 @@ 
TEST(TEST_CATEGORY, numeric_traits_norm_min) { TEST(TEST_CATEGORY, numeric_traits_denorm_min) { TestNumericTraits(); TestNumericTraits(); - // FIXME_NVHPC long double not supported -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -302,8 +303,10 @@ TEST(TEST_CATEGORY, numeric_traits_finite_min_max) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -326,8 +329,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -349,8 +354,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -358,8 +365,10 @@ TEST(TEST_CATEGORY, numeric_traits_digits10) { TEST(TEST_CATEGORY, numeric_traits_max_digits10) { TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -380,8 +389,10 @@ TEST(TEST_CATEGORY, numeric_traits_radix) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); #endif } @@ -395,8 +406,10 @@ TEST(TEST_CATEGORY, numeric_traits_min_max_exponent) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -407,8 +420,10 @@ TEST(TEST_CATEGORY, 
numeric_traits_min_max_exponent10) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -426,8 +441,10 @@ TEST(TEST_CATEGORY, numeric_traits_quiet_and_signaling_nan) { TestNumericTraits(); TestNumericTraits(); TestNumericTraits(); -#if !defined(KOKKOS_ENABLE_CUDA) || \ - !defined(KOKKOS_COMPILER_NVHPC) // 23.7 long double: + // FIXME_NVHPC 23.7 long double + // FIXME_OPENMPTARGET long double on Intel GPUs +#if (!defined(KOKKOS_ENABLE_CUDA) || !defined(KOKKOS_COMPILER_NVHPC)) && \ + (!defined(KOKKOS_ENABLE_OPENMPTARGET) || !defined(KOKKOS_ARCH_INTEL_GPU)) TestNumericTraits(); TestNumericTraits(); #endif @@ -442,7 +459,7 @@ struct HasNoSpecialization {}; using TRAIT##_value_t = decltype(Kokkos::Experimental::TRAIT::value); \ template \ using has_##TRAIT = Kokkos::is_detected; \ - static_assert(!has_##TRAIT::value, ""); + static_assert(!has_##TRAIT::value); CHECK_TRAIT_IS_SFINAE_FRIENDLY(infinity) CHECK_TRAIT_IS_SFINAE_FRIENDLY(finite_min) @@ -524,39 +541,39 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, denorm_min); #endif // clang-format off -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min(), ""); -static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min(), ""); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< float>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits< double>::min()); +static_assert(Kokkos::Experimental::norm_min::value == std::numeric_limits::min()); // integer types -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min(), ""); -static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == 
std::numeric_limits< short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< signed char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned char>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned short>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< unsigned long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits< long long int>::min()); +static_assert(Kokkos::Experimental::finite_min::value == std::numeric_limits::min()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< signed char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned char>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned short>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< unsigned long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< long long int>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // floating point types -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max(), ""); -static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max(), ""); 
+static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_min::value == -std::numeric_limits::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< float>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits< double>::max()); +static_assert(Kokkos::Experimental::finite_max::value == std::numeric_limits::max()); // clang-format on CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(bool, digits); @@ -623,15 +640,13 @@ CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT(long double, max_exponent10); #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION #undef CHECK_SAME_AS_NUMERIC_LIMITS_MEMBER_CONSTANT -#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ - static_assert( \ - std::numeric_limits::TRAIT() != std::numeric_limits::TRAIT(), ""); \ - static_assert(Kokkos::Experimental::TRAIT::value != \ - std::numeric_limits::TRAIT(), \ - "") +#define CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(T, TRAIT) \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + Kokkos::Experimental::TRAIT::value); \ + static_assert(std::numeric_limits::TRAIT() != \ + std::numeric_limits::TRAIT()); \ + static_assert(Kokkos::Experimental::TRAIT::value != \ + std::numeric_limits::TRAIT()) // Workaround compiler issue error: expression must have a constant value // See kokkos/kokkos#4574 @@ -651,14 +666,11 @@ CHECK_NAN_SAME_AS_NUMERIC_LIMITS_MEMBER_FUNCTION(long double, signaling_NaN); #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value == \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ @@ -706,17 +718,13 @@ CHECK_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(max_exponent10); #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(T, TRAIT) \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - ""); \ + Kokkos::Experimental::TRAIT::value); \ static_assert(Kokkos::Experimental::TRAIT::value != \ - Kokkos::Experimental::TRAIT::value, \ - "") + Kokkos::Experimental::TRAIT::value) #define CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES_FLOATING_POINT(TRAIT) \ CHECK_NAN_INSTANTIATED_ON_CV_QUALIFIED_TYPES(float, TRAIT); \ diff --git a/core/unit_test/TestOccupancyControlTrait.hpp b/core/unit_test/TestOccupancyControlTrait.hpp new file mode 100644 index 00000000000..345a906d668 --- /dev/null +++ b/core/unit_test/TestOccupancyControlTrait.hpp @@ -0,0 +1,80 @@ +//@HEADER +// ************************************************************************ 
+// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +namespace { + +template +void test_policy_execution(const Kokkos::RangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int){}); +} +template +void test_policy_execution(const Kokkos::TeamPolicy& policy) { + Kokkos::parallel_for( + policy, + KOKKOS_LAMBDA( + const typename Kokkos::TeamPolicy::member_type&){}); +} +template +void test_policy_execution(const Kokkos::MDRangePolicy& policy) { + Kokkos::parallel_for(policy, KOKKOS_LAMBDA(int, int){}); +} + +template +void test_prefer_desired_occupancy(Policy policy) { + using Kokkos::Experimental::DesiredOccupancy; + using Kokkos::Experimental::MaximizeOccupancy; + using Kokkos::Experimental::prefer; + using Kokkos::Experimental::WorkItemProperty; + + // MaximizeOccupancy -> MaximizeOccupancy + auto const policy_still_no_occ = prefer(policy, MaximizeOccupancy{}); + test_policy_execution(policy_still_no_occ); + + // MaximizeOccupancy -> DesiredOccupancy + auto const policy_with_occ = + prefer(policy_still_no_occ, DesiredOccupancy{33}); + test_policy_execution(policy_with_occ); + + // DesiredOccupancy -> DesiredOccupancy + auto const policy_change_occ = prefer(policy_with_occ, DesiredOccupancy{24}); + test_policy_execution(policy_change_occ); + + // DesiredOccupancy -> DesiredOccupancy w/ hint + auto policy_with_occ_and_hint = Kokkos::Experimental::require( + policy_change_occ, + Kokkos::Experimental::WorkItemProperty::HintLightWeight); + test_policy_execution(policy_with_occ_and_hint); + + // DesiredOccupancy -> MaximizeOccupancy + auto const policy_drop_occ = + prefer(policy_with_occ_and_hint, MaximizeOccupancy{}); + test_policy_execution(policy_drop_occ); +} + +// FIXME_MSVC_WITH_CUDA +// This test doesn't compile with CUDA on Windows +#if !(defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA)) +TEST(TEST_CATEGORY, occupancy_control) { + test_prefer_desired_occupancy(Kokkos::RangePolicy(0, 1)); + test_prefer_desired_occupancy( + Kokkos::TeamPolicy{1, Kokkos::AUTO}); + test_prefer_desired_occupancy( + Kokkos::MDRangePolicy>{{0, 0}, {1, 1}}); +} +#endif +} // namespace diff --git a/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp b/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp index 176ce9b5fed..a56dfd9efc7 100644 --- a/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp +++ b/core/unit_test/TestParseCmdLineArgsAndEnvVars.cpp @@ -166,22 +166,6 @@ TEST(defaultdevicetype, cmd_line_args_device_id) { EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"--dummy"}); } -TEST(defaultdevicetype, cmd_line_args_num_devices) { - CmdLineArgsHelper cla = {{ - "--kokkos-num-devices=5,6", - "--kokkos-num-devices=7", - "-v", - }}; - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_command_line_arguments(cla.argc(), cla.argv(), settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 7); - // this is the current behavior, not suggesting this cannot be revisited - EXPECT_TRUE(settings.has_skip_device()) << "behavior changed see comment"; - EXPECT_EQ(settings.get_skip_device(), 6) << "behavior changed see comment"; - 
EXPECT_REMAINING_COMMAND_LINE_ARGUMENTS(cla, {"-v"}); -} - TEST(defaultdevicetype, cmd_line_args_disable_warning) { CmdLineArgsHelper cla = {{ "--kokkos-disable-warnings=1", @@ -351,20 +335,6 @@ TEST(defaultdevicetype, env_vars_device_id) { EXPECT_EQ(settings.get_device_id(), 33); } -TEST(defaultdevicetype, env_vars_num_devices) { - EnvVarsHelper ev = {{ - {"KOKKOS_NUM_DEVICES", "4"}, - {"KOKKOS_SKIP_DEVICE", "1"}, - }}; - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); - Kokkos::InitializationSettings settings; - Kokkos::Impl::parse_environment_variables(settings); - EXPECT_TRUE(settings.has_num_devices()); - EXPECT_EQ(settings.get_num_devices(), 4); - EXPECT_TRUE(settings.has_skip_device()); - EXPECT_EQ(settings.get_skip_device(), 1); -} - TEST(defaultdevicetype, env_vars_disable_warnings) { for (auto const& value_true : {"1", "true", "TRUE", "yEs"}) { EnvVarsHelper ev = {{ @@ -420,22 +390,20 @@ TEST(defaultdevicetype, env_vars_tune_internals) { } TEST(defaultdevicetype, visible_devices) { -#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ - do { \ - EnvVarsHelper ev{ENV}; \ - SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ - Kokkos::InitializationSettings settings; \ - Kokkos::Impl::parse_environment_variables(settings); \ - auto computed = Kokkos::Impl::get_visible_devices(settings, CNT); \ - std::vector expected = DEV; \ - EXPECT_EQ(expected.size(), computed.size()) \ - << ev << "device count: " << CNT; \ - auto n = std::min(expected.size(), computed.size()); \ - for (int i = 0; i < n; ++i) { \ - EXPECT_EQ(expected[i], computed[i]) \ - << "devices differ at index " << i << '\n' \ - << ev << "device count: " << CNT; \ - } \ +#define KOKKOS_TEST_VISIBLE_DEVICES(ENV, CNT, DEV) \ + do { \ + EnvVarsHelper ev{ENV}; \ + SKIP_IF_ENVIRONMENT_VARIABLE_ALREADY_SET(ev); \ + auto computed = Kokkos::Impl::get_visible_devices(CNT); \ + std::vector expected = DEV; \ + EXPECT_EQ(expected.size(), computed.size()) \ + << ev << "device count: " << CNT; \ + auto n = std::min(expected.size(), computed.size()); \ + for (int i = 0; i < n; ++i) { \ + EXPECT_EQ(expected[i], computed[i]) \ + << "devices differ at index " << i << '\n' \ + << ev << "device count: " << CNT; \ + } \ } while (false) #define DEV(...) \ @@ -444,6 +412,8 @@ TEST(defaultdevicetype, visible_devices) { // first test with all environment variables that are involved in determining // the visible devices so user set var do not mess up the logic below. + // KOKKOS_NUM_DEVICES and KOKKOS_SKIP_DEVICE are deprecated since 3.7 and are + // not taken into account anymore. KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, {"KOKKOS_SKIP_DEVICE", "1"}), @@ -452,10 +422,10 @@ TEST(defaultdevicetype, visible_devices) { ENV({"KOKKOS_VISIBLE_DEVICES", "2,1"}, {"KOKKOS_NUM_DEVICES", "8"}, ), 6, DEV(2, 1)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_NUM_DEVICES", "3"}), 6, - DEV(0, 1, 2)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES( ENV({"KOKKOS_NUM_DEVICES", "4"}, {"KOKKOS_SKIP_DEVICE", "1"}, ), 6, - DEV(0, 2, 3)); + DEV(0, 1, 2, 3, 4, 5)); KOKKOS_TEST_VISIBLE_DEVICES(ENV({"KOKKOS_VISIBLE_DEVICES", "1,3,4"}), 6, DEV(1, 3, 4)); KOKKOS_TEST_VISIBLE_DEVICES( diff --git a/core/unit_test/TestRangePolicyCTAD.cpp b/core/unit_test/TestRangePolicyCTAD.cpp new file mode 100644 index 00000000000..20288e2b40a --- /dev/null +++ b/core/unit_test/TestRangePolicyCTAD.cpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Core_fwd.hpp"
+
+namespace {
+
+struct TestRangePolicyCTAD {
+  struct SomeExecutionSpace {
+    using execution_space = SomeExecutionSpace;
+    using size_type       = size_t;
+
+    [[maybe_unused]] static int concurrency() { return 0; }
+  };
+  static_assert(Kokkos::is_execution_space_v<SomeExecutionSpace>);
+
+  struct ImplicitlyConvertibleToDefaultExecutionSpace {
+    [[maybe_unused]] operator Kokkos::DefaultExecutionSpace() const {
+      return Kokkos::DefaultExecutionSpace();
+    }
+  };
+  static_assert(!Kokkos::is_execution_space_v<
+                ImplicitlyConvertibleToDefaultExecutionSpace>);
+
+  [[maybe_unused]] static inline auto i64 = int64_t();
+  [[maybe_unused]] static inline auto i32 = int32_t();
+  [[maybe_unused]] static inline auto cs  = Kokkos::ChunkSize(0);
+  [[maybe_unused]] static inline auto des = Kokkos::DefaultExecutionSpace();
+  [[maybe_unused]] static inline auto nes =
+      ImplicitlyConvertibleToDefaultExecutionSpace();
+  [[maybe_unused]] static inline auto ses = SomeExecutionSpace();
+
+  // RangePolicy()
+
+  [[maybe_unused]] static inline auto rp = Kokkos::RangePolicy{};
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rp)>);
+
+  // RangePolicy(index_type, index_type)
+
+  [[maybe_unused]] static inline auto rpi64i64 = Kokkos::RangePolicy(i64, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i64)>);
+
+  [[maybe_unused]] static inline auto rpi64i32 = Kokkos::RangePolicy(i64, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i32)>);
+
+  [[maybe_unused]] static inline auto rpi32i64 = Kokkos::RangePolicy(i32, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i64)>);
+
+  [[maybe_unused]] static inline auto rpi32i32 = Kokkos::RangePolicy(i32, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i32)>);
+
+  // RangePolicy(index_type, index_type, ChunkSize)
+
+  [[maybe_unused]] static inline auto rpi64i64cs =
+      Kokkos::RangePolicy(i64, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i64cs)>);
+
+  [[maybe_unused]] static inline auto rpi64i32cs =
+      Kokkos::RangePolicy(i64, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi64i32cs)>);
+
+  [[maybe_unused]] static inline auto rpi32i64cs =
+      Kokkos::RangePolicy(i32, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i64cs)>);
+
+  [[maybe_unused]] static inline auto rpi32i32cs =
+      Kokkos::RangePolicy(i32, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpi32i32cs)>);
+
+  // RangePolicy(execution_space, index_type, index_type)
+
+  [[maybe_unused]] static inline auto rpdesi64i64 =
+      Kokkos::RangePolicy(des, i64, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi64i64)>);
+
+  [[maybe_unused]] static inline auto rpdesi32i32 =
+      Kokkos::RangePolicy(des, i32, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi32i32)>);
+
+  [[maybe_unused]] static inline auto rpnesi64i64 =
+      Kokkos::RangePolicy(nes, i64, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi64i64)>);
+
+  [[maybe_unused]] static inline auto rpnesi32i32 =
+      Kokkos::RangePolicy(nes, i32, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi32i32)>);
+
+  [[maybe_unused]] static inline auto rpsesi64i64 =
+      Kokkos::RangePolicy(ses, i64, i64);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>,
+                               decltype(rpsesi64i64)>);
+
+  [[maybe_unused]] static inline auto rpsesi32i32 =
+      Kokkos::RangePolicy(ses, i32, i32);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>,
+                               decltype(rpsesi32i32)>);
+
+  // RangePolicy(execution_space, index_type, index_type, ChunkSize)
+
+  [[maybe_unused]] static inline auto rpdesi64i64cs =
+      Kokkos::RangePolicy(des, i64, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi64i64cs)>);
+
+  [[maybe_unused]] static inline auto rpdesi32i32cs =
+      Kokkos::RangePolicy(des, i32, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpdesi32i32cs)>);
+
+  [[maybe_unused]] static inline auto rpnesi64i64cs =
+      Kokkos::RangePolicy(nes, i64, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi64i64cs)>);
+
+  [[maybe_unused]] static inline auto rpnesi32i32cs =
+      Kokkos::RangePolicy(nes, i32, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<>, decltype(rpnesi32i32cs)>);
+
+  [[maybe_unused]] static inline auto rpsesi64i64cs =
+      Kokkos::RangePolicy(ses, i64, i64, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>,
+                               decltype(rpsesi64i64cs)>);
+
+  [[maybe_unused]] static inline auto rpsesi32i32cs =
+      Kokkos::RangePolicy(ses, i32, i32, cs);
+  static_assert(std::is_same_v<Kokkos::RangePolicy<SomeExecutionSpace>,
+                               decltype(rpsesi32i32cs)>);
+
+};  // TestRangePolicyCTAD struct
+
+// To eliminate maybe_unused warning on some compilers
+
+[[maybe_unused]] const Kokkos::DefaultExecutionSpace nestodes =
+    TestRangePolicyCTAD::ImplicitlyConvertibleToDefaultExecutionSpace();
+
+[[maybe_unused]] const auto sesconcurrency =
+    TestRangePolicyCTAD::ses.concurrency();
+
+}  // namespace
diff --git a/core/unit_test/TestRangePolicyConstructors.hpp b/core/unit_test/TestRangePolicyConstructors.hpp
index 0a7e59ed980..c8c1542af13 100644
--- a/core/unit_test/TestRangePolicyConstructors.hpp
+++ b/core/unit_test/TestRangePolicyConstructors.hpp
@@ -18,6 +18,9 @@
 
 #include <Kokkos_Core.hpp>
 
+#include <limits>
+#include <regex>
+
 namespace {
 
 TEST(TEST_CATEGORY, range_policy_runtime_parameters) {
@@ -70,4 +73,127 @@ TEST(TEST_CATEGORY, range_policy_runtime_parameters) {
   }
 }
 
+TEST(TEST_CATEGORY_DEATH, range_policy_invalid_bounds) {
+  using Policy    = Kokkos::RangePolicy<TEST_EXECSPACE>;
+  using ChunkSize = Kokkos::ChunkSize;
+
+  std::string msg =
+      "Kokkos::RangePolicy bounds error: The lower bound (100) is greater than "
+      "the upper bound (90).\n";
+#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  // escape the parentheses in the regex to match the error message
+  msg = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&");
+  ASSERT_DEATH({ (void)Policy(100, 90); }, msg);
+
+  ASSERT_DEATH({ (void)Policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10)); },
+               msg);
+#else
+
+  if (!Kokkos::show_warnings()) {
+    GTEST_SKIP() << "Kokkos warning messages are disabled";
+  }
+
+  {
+    ::testing::internal::CaptureStderr();
+    Policy policy(100, 90);
+    ASSERT_EQ((int)policy.begin(), 0);
+    ASSERT_EQ((int)policy.end(), 0);
+#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
+    ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg);
+#else
+    ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty());
+    (void)msg;
+#endif
+  }
+
+  {
+    ::testing::internal::CaptureStderr();
+    Policy policy(TEST_EXECSPACE(), 100, 90, ChunkSize(10));
+    ASSERT_EQ((int)policy.begin(), 0);
+    ASSERT_EQ((int)policy.end(), 0);
+#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
+    ASSERT_EQ(::testing::internal::GetCapturedStderr(), msg);
+#else
+    ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty());
+    (void)msg;
+#endif
+  }
+
+#endif
+}
+
+TEST(TEST_CATEGORY_DEATH, range_policy_implicitly_converted_bounds) {
+  using UIntIndexType = Kokkos::IndexType<unsigned>;
+  using
IntIndexType = Kokkos::IndexType; + using UIntPolicy = Kokkos::RangePolicy; + using IntPolicy = Kokkos::RangePolicy; + + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion is " + "performed on a bound (), which may not preserve its original value.\n"; + + auto get_error_msg = [](auto str, auto val) { + return str.insert(str.find("(") + 1, std::to_string(val).c_str()); + }; +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 + std::string expected = std::regex_replace(msg, std::regex("\\(|\\)"), "\\$&"); + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10); }, + get_error_msg(expected, test_val)); + } + { + unsigned test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0u, test_val); }, + get_error_msg(expected, test_val)); + } + { + long long test_val = std::numeric_limits::max(); + ASSERT_DEATH({ (void)IntPolicy(0LL, test_val); }, + get_error_msg(expected, test_val)); + } + { + int test_val = -1; + ASSERT_DEATH({ (void)UIntPolicy(test_val, 10, Kokkos::ChunkSize(2)); }, + get_error_msg(expected, test_val)); + } + +#else + { + ::testing::internal::CaptureStderr(); + int test_val = -1; + UIntPolicy policy(test_val, 10); + ASSERT_EQ(policy.begin(), 0u); + ASSERT_EQ(policy.end(), 0u); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } + { + ::testing::internal::CaptureStderr(); + unsigned test_val = std::numeric_limits::max(); + IntPolicy policy(0u, test_val); + ASSERT_EQ(policy.begin(), 0); + ASSERT_EQ(policy.end(), 0); +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + if (Kokkos::show_warnings()) { + auto s = std::string(::testing::internal::GetCapturedStderr()); + ASSERT_EQ(s.substr(0, s.find("\n") + 1), get_error_msg(msg, test_val)); + } +#else + ASSERT_TRUE(::testing::internal::GetCapturedStderr().empty()); + (void)msg; + (void)get_error_msg; +#endif + } +#endif +} + } // namespace diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index 957b9a0ca1a..fbcb9629af0 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -19,6 +19,7 @@ #include #include +#include //-------------------------------------------------------------------------- @@ -46,6 +47,37 @@ struct TestReducers { void operator()(const int& i, Scalar& value) const { value += values(i); } }; + struct TeamSumFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m, Scalar& value) const { + if (m.team_rank() == m.team_size() - 1) value += Scalar(1); + } + }; + + struct TeamSumNestedFunctor { + using member_type = typename Kokkos::TeamPolicy::member_type; + + SumFunctor f; + int M, N; + Kokkos::View result; + + TeamSumNestedFunctor(SumFunctor& f_, const int M_, const int N_, + Kokkos::View result_) + : f(f_), M(M_), N(N_), result(result_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& m) const { + const int i = m.league_rank(); + Scalar local_scalar; + Kokkos::Sum reducer_scalar( + local_scalar); + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(m, N), f, reducer_scalar); + result(i) = local_scalar; + } + }; + struct ProdFunctor { Kokkos::View values; @@ -319,6 +351,102 @@ struct TestReducers { value = value || values(i); } }; + + 
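The nested-reduction pattern that TeamSumNestedFunctor exercises above looks, in isolation, roughly like the sketch below. The function name, view shapes, and scalar type are illustrative, not part of this change:

#include <Kokkos_Core.hpp>

void team_sums(Kokkos::View<double*> out, Kokkos::View<double**> data) {
  using team_policy = Kokkos::TeamPolicy<>;
  using member_type = team_policy::member_type;
  const int n       = data.extent_int(1);
  Kokkos::parallel_for(
      "team_sums", team_policy(out.extent_int(0), Kokkos::AUTO),
      KOKKOS_LAMBDA(const member_type& team) {
        double partial = 0;
        // Each team reduces one row into a Sum reducer ...
        Kokkos::parallel_reduce(
            Kokkos::TeamThreadRange(team, n),
            [&](int j, double& acc) { acc += data(team.league_rank(), j); },
            Kokkos::Sum<double>(partial));
        // ... and a single thread publishes the team's result.
        Kokkos::single(Kokkos::PerTeam(team),
                       [&]() { out(team.league_rank()) = partial; });
      });
}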
// get number of teams for TeamPolicy depending on the tested type + constexpr static int get_num_teams() { + if constexpr (sizeof(Scalar) == 1) { + return 126; + } else if constexpr (std::is_same_v) { + return 256; + } + + return 1024; + } + + static void test_sum_team_policy(int N, SumFunctor f, Scalar reference_sum) { +#ifdef KOKKOS_ENABLE_OPENACC + if constexpr (std::is_same_v && + (std::is_same_v || + std::is_same_v)) { + return; // FIXME_OPENACC + } +#endif + + Scalar sum_scalar; + Kokkos::View sum_view("result"); + Kokkos::deep_copy(sum_view, Scalar(1)); + + // Test team policy reduction + { + constexpr int num_teams = get_num_teams(); + TeamSumFunctor tf; + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same::value + ? 32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy(num_teams, team_size); + Kokkos::parallel_reduce(team_pol, tf, sum_view); + Kokkos::deep_copy(sum_scalar, sum_view); + ASSERT_EQ(sum_scalar, Scalar{num_teams}) << "num_teams: " << num_teams; + } + + // Test TeamThreadRange level reduction with 0 work produces 0 result + { + const int league_size = 1; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, 0, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int team_size = + std::is_same::value + ? 32 + : 1; +#else + int team_size = 1; +#endif + auto team_pol = Kokkos::TeamPolicy(1, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + ASSERT_EQ(result_h(0), Scalar{0}) << "N: " << N; + } + + // Same test as above, but with inner reduction over N, and league_size=10 + { + const int league_size = 10; + Kokkos::View result("result", league_size); + TeamSumNestedFunctor tnf(f, league_size, N, result); + // FIXME_OPENMPTARGET temporary restriction for team size to be at least + // 32 +#ifdef KOKKOS_ENABLE_OPENMPTARGET + int initial_team_size = + std::is_same_v ? 
32 + : 1; +#else + int initial_team_size = 1; +#endif + auto team_size_max = + Kokkos::TeamPolicy(league_size, initial_team_size) + .team_size_max(tnf, Kokkos::ParallelForTag()); + auto team_size = std::min(team_size_max, TEST_EXECSPACE().concurrency()); + auto team_pol = Kokkos::TeamPolicy(league_size, team_size); + Kokkos::parallel_for(team_pol, tnf); + auto result_h = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), result); + for (int i = 0; i < result_h.extent_int(0); ++i) { + ASSERT_EQ(result_h(i), reference_sum) << "N: " << N; + } + } + } + static void test_sum(int N) { Kokkos::View values("Values", N); auto h_values = Kokkos::create_mirror_view(values); @@ -374,6 +502,8 @@ struct TestReducers { ASSERT_EQ(sum_scalar_view, reference_sum) << "N: " << N; } + test_sum_team_policy(N, f, reference_sum); + { Kokkos::View sum_view("View"); sum_view() = Scalar(1); diff --git a/core/unit_test/TestReducers_d.hpp b/core/unit_test/TestReducers_d.hpp index 19eaa6d7000..ecf851aa108 100644 --- a/core/unit_test/TestReducers_d.hpp +++ b/core/unit_test/TestReducers_d.hpp @@ -80,6 +80,20 @@ TEST(TEST_CATEGORY, reducers_int8_t) { TestReducers::test_prod(4); } +TEST(TEST_CATEGORY, reducers_int16_t) { + using ThisTestType = int16_t; + + TestReducers::test_sum(1); + TestReducers::test_sum(2); + TestReducers::test_sum(3); + TestReducers::test_sum(4); + + TestReducers::test_prod(1); + TestReducers::test_prod(2); + TestReducers::test_prod(3); + TestReducers::test_prod(4); +} + #if !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_OPENMPTARGET) // TODO - resolve: "Kokkos_HIP_Vectorization.hpp:80:15: error: call to // implicitly-deleted default constructor of 'conv_type' diff --git a/core/unit_test/TestSwap.hpp b/core/unit_test/TestSwap.hpp new file mode 100644 index 00000000000..4e98351cf19 --- /dev/null +++ b/core/unit_test/TestSwap.hpp @@ -0,0 +1,68 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
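The file introduced below, TestSwap.hpp, exercises Kokkos::kokkos_swap, a device-callable counterpart to std::swap with scalar and C-array overloads. A minimal host-side sketch of the API (the wrapper function is illustrative; inside kernels the calls look identical):

#include <Kokkos_Core.hpp>

void swap_demo() {
  int a = 1, b = 2;
  Kokkos::kokkos_swap(a, b);  // now a == 2, b == 1
  int u[3] = {1, 2, 3}, v[3] = {4, 5, 6};
  Kokkos::kokkos_swap(u, v);  // the array overload swaps element-wise
}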
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include +#include +#include + +namespace { + +template +struct TestSwap { + KOKKOS_FUNCTION void operator()(int, int& err) const { + { + int a = 1; + int b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int, int)\n"); + ++err; + } + } + { + float a = 1; + float b = 2; + Kokkos::kokkos_swap(a, b); + if (!(a == 2 && b == 1)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(float, float)\n"); + ++err; + } + } + { + int a[3] = {1, 2, 3}; + int b[3] = {4, 5, 6}; + Kokkos::kokkos_swap(a, b); + if (!(a[0] == 4 && a[1] == 5 && a[2] == 6 && b[0] == 1 && b[1] == 2 && + b[2] == 3)) { + Kokkos::printf("Failed Kokkos::kokkos_swap(int[3], int[3])\n"); + ++err; + } + } + } + + TestSwap() { + int errors; + Kokkos::parallel_reduce( + "TestSwap", Kokkos::RangePolicy(0, 1), *this, errors); + EXPECT_EQ(errors, 0); + } +}; + +TEST(TEST_CATEGORY, kokkos_swap) { TestSwap(); } + +} // namespace diff --git a/core/unit_test/TestTeamBasic.hpp b/core/unit_test/TestTeamBasic.hpp index c395bc0837c..a3d84c5e16b 100644 --- a/core/unit_test/TestTeamBasic.hpp +++ b/core/unit_test/TestTeamBasic.hpp @@ -280,7 +280,7 @@ namespace Test { // Test for non-arithmetic type TEST(TEST_CATEGORY, team_broadcast_long_wrapper) { - static_assert(!std::is_arithmetic::value, ""); + static_assert(!std::is_arithmetic::value); TestTeamBroadcast, long_wrapper>::test_teambroadcast(0, 1); diff --git a/core/unit_test/TestTeamMDRange.hpp b/core/unit_test/TestTeamMDRange.hpp index 6e65cde0cf8..81931467c5a 100644 --- a/core/unit_test/TestTeamMDRange.hpp +++ b/core/unit_test/TestTeamMDRange.hpp @@ -169,7 +169,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -202,7 +209,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -236,7 +250,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -272,7 +293,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = 
team.league_rank(); @@ -310,7 +338,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -350,7 +385,14 @@ struct TestTeamThreadMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -420,7 +462,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -457,7 +506,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -496,7 +552,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -536,7 +599,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -579,7 +649,14 @@ struct TestThreadVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -620,7 +697,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + 
ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -653,7 +737,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -687,7 +778,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -723,7 +821,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -761,7 +866,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { FillFlattenedIndex fillFlattenedIndex(leagueSize, n0, n1, n2, n3, n4, n5); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -801,7 +913,14 @@ struct TestTeamVectorMDRangeParallelFor : public TestTeamMDParallelFor { n6); Kokkos::parallel_for( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(const TeamType& team) { int leagueRank = team.league_rank(); @@ -908,13 +1027,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k) = fillFlattenedIndex(i, j, k); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -923,7 +1049,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -952,13 +1084,20 @@ struct TestTeamThreadMDRangeParallelReduce : public 
TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -966,7 +1105,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& threadSum) { threadSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -997,13 +1142,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1013,7 +1165,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1045,13 +1203,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1061,7 +1226,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; + // FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1100,13 +1271,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 
+#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1116,7 +1294,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1157,13 +1341,20 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; Kokkos::parallel_reduce( Kokkos::TeamThreadMDRange, TeamType>( @@ -1174,7 +1365,13 @@ struct TestTeamThreadMDRangeParallelReduce : public TestTeamMDParallelReduce { threadSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1207,20 +1404,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1228,11 +1431,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k); }, threadSum); - - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1263,20 +1464,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = 
team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1286,10 +1493,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1321,20 +1527,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1344,10 +1556,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1384,20 +1595,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1407,10 +1624,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1451,20 +1667,26 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, 
DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); auto teamThreadRange = Kokkos::TeamThreadRange(team, n0); auto threadVectorRange = Kokkos::ThreadVectorMDRange, TeamType>( team, n1, n2, n3, n4, n5, n6); - Kokkos::parallel_for(teamThreadRange, [=, &teamSum](const int& i) { + Kokkos::parallel_for(teamThreadRange, [=, &leagueSum](const int& i) { DataType threadSum = 0; Kokkos::parallel_reduce( threadVectorRange, @@ -1474,10 +1696,9 @@ struct TestThreadVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { }, threadSum); - teamSum += threadSum; + Kokkos::single(Kokkos::PerThread(team), + [&]() { leagueSum += threadSum; }); }); - - leagueSum += teamSum; }, finalSum); @@ -1510,13 +1731,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l) = fillFlattenedIndex(i, j, k, l); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1527,7 +1755,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { [=](const int& i, const int& j, const int& k, DataType& vectorSum) { vectorSum += v(leagueRank, i, j, k); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1558,13 +1792,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m) = fillFlattenedIndex(i, j, k, l, m); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1577,7 +1818,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1609,13 +1856,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { v(i, j, k, l, m, n) = fillFlattenedIndex(i, j, k, l, m, n); }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto 
leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1628,7 +1882,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1665,13 +1925,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1684,7 +1951,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1725,13 +1998,20 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { } }); - DataType finalSum = 0; + DataType finalSum; Kokkos::parallel_reduce( - Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO), + Kokkos::TeamPolicy(leagueSize, Kokkos::AUTO, +#ifdef KOKKOS_ENABLE_OPENMPTARGET + 2 +#else + Kokkos::TeamPolicy< + ExecSpace>::vector_length_max() +#endif + ), KOKKOS_LAMBDA(TeamType const& team, DataType& leagueSum) { - auto leagueRank = team.league_rank(); - DataType teamSum = 0; + auto leagueRank = team.league_rank(); + DataType teamSum; auto teamVectorRange = Kokkos::TeamVectorMDRange, TeamType>( @@ -1745,7 +2025,13 @@ struct TestTeamVectorMDRangeParallelReduce : public TestTeamMDParallelReduce { vectorSum += v(leagueRank, i, j, k, l, m, n, o); }, teamSum); - leagueSum += teamSum; +// FIXME_OPENMPTARGET +#ifdef KOKKOS_ENABLE_OPENMPTARGET + if (team.team_rank() == 0) leagueSum += teamSum; +#else + Kokkos::single(Kokkos::PerTeam(team), + [&]() { leagueSum += teamSum; }); +#endif }, finalSum); @@ -1904,13 +2190,6 @@ TEST(TEST_CATEGORY, ThreadVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestThreadVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_ThreadVectorMDRange(dims); TestThreadVectorMDRangeParallelReduce:: @@ -1944,13 +2223,6 @@ TEST(TEST_CATEGORY, TeamVectorMDRangeParallelReduce) { GTEST_SKIP() << "skipping because of bug in group_barrier implementation"; #endif -// FIXME_OPENMPTARGET_CRAY: The unit tests fails correctness. 
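Two changes recur throughout the hunks above: the TeamPolicy constructors now pass an explicit vector length (2 on OpenMPTarget, vector_length_max() otherwise), and the per-team contribution to the league-level reduction is wrapped in Kokkos::single. The latter matters because every member of a team executes the reduction lambda, so a bare `leagueSum += teamSum;` is applied once per member. A self-contained sketch of the corrected pattern, with illustrative names:

#include <Kokkos_Core.hpp>

double league_total(int league_size) {
  using policy_type = Kokkos::TeamPolicy<>;
  double total      = 0;
  Kokkos::parallel_reduce(
      "league_total",
      policy_type(league_size, Kokkos::AUTO, policy_type::vector_length_max()),
      KOKKOS_LAMBDA(const policy_type::member_type& team, double& league_sum) {
        double team_sum = 1.0;  // stand-in for a nested team-level reduction
        // Restrict the update to one thread per team; otherwise team_sum
        // would be added once per team member, overcounting by team_size().
        Kokkos::single(Kokkos::PerTeam(team),
                       [&]() { league_sum += team_sum; });
      },
      total);
  return total;  // == league_size
}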
-#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CRAYCLANG) - if (std::is_same_v) - GTEST_SKIP() << "Cray compiler fails correctness at runtime with the " - "OpenMPTarget backend."; -#endif - TestTeamVectorMDRangeParallelReduce:: test_parallel_reduce_for_4D_TeamVectorMDRange(dims); TestTeamVectorMDRangeParallelReduce:: diff --git a/core/unit_test/TestTeamPolicyConstructors.hpp b/core/unit_test/TestTeamPolicyConstructors.hpp index 5b0bfdb1755..9d89f757086 100644 --- a/core/unit_test/TestTeamPolicyConstructors.hpp +++ b/core/unit_test/TestTeamPolicyConstructors.hpp @@ -20,11 +20,24 @@ namespace { +struct SomeTag {}; + +struct FunctorFor { + KOKKOS_FUNCTION + void operator()( + Kokkos::TeamPolicy::member_type const&) const {} + + KOKKOS_FUNCTION + void operator()( + SomeTag, Kokkos::TeamPolicy::member_type const&) const {} +}; + template void test_run_time_parameters() { int league_size = 131; using ExecutionSpace = typename Policy::execution_space; + using ParallelTag = Kokkos::ParallelForTag; int team_size = 4 < ExecutionSpace().concurrency() ? 4 : ExecutionSpace().concurrency(); #ifdef KOKKOS_ENABLE_HPX @@ -44,6 +57,8 @@ void test_run_time_parameters() { ASSERT_EQ(p1.team_size(), team_size); ASSERT_GT(p1.chunk_size(), 0); ASSERT_EQ(p1.scratch_size(0), 0u); + ASSERT_GT(p1.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p1.team_size_recommended(FunctorFor(), ParallelTag()), 0); Policy p2 = p1.set_chunk_size(chunk_size); ASSERT_EQ(p1.league_size(), league_size); @@ -112,6 +127,8 @@ void test_run_time_parameters() { Policy p8; // default constructed ASSERT_EQ(p8.league_size(), 0); ASSERT_EQ(p8.scratch_size(0), 0u); + ASSERT_GT(p8.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p8.team_size_recommended(FunctorFor(), ParallelTag()), 0); p8 = p3; // call assignment operator ASSERT_EQ(p3.league_size(), league_size); ASSERT_EQ(p3.team_size(), team_size); @@ -121,11 +138,25 @@ void test_run_time_parameters() { ASSERT_EQ(p8.team_size(), team_size); ASSERT_EQ(p8.chunk_size(), chunk_size); ASSERT_EQ(p8.scratch_size(0), size_t(scratch_size)); + + Policy p9(league_size, Kokkos::AUTO); + ASSERT_EQ(p9.league_size(), league_size); + ASSERT_GT(p9.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p9.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p10(league_size, team_size, Kokkos::AUTO); + ASSERT_EQ(p10.league_size(), league_size); + ASSERT_EQ(p10.team_size(), team_size); + ASSERT_GT(p10.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p10.team_size_recommended(FunctorFor(), ParallelTag()), 0); + + Policy p11(league_size, Kokkos::AUTO, Kokkos::AUTO); + ASSERT_EQ(p11.league_size(), league_size); + ASSERT_GT(p11.team_size_max(FunctorFor(), ParallelTag()), 0); + ASSERT_GT(p11.team_size_recommended(FunctorFor(), ParallelTag()), 0); } TEST(TEST_CATEGORY, team_policy_runtime_parameters) { - struct SomeTag {}; - using TestExecSpace = TEST_EXECSPACE; using DynamicSchedule = Kokkos::Schedule; using LongIndex = Kokkos::IndexType; diff --git a/core/unit_test/TestTeamVector.hpp b/core/unit_test/TestTeamVector.hpp index 39122736ed7..5e16539d652 100644 --- a/core/unit_test/TestTeamVector.hpp +++ b/core/unit_test/TestTeamVector.hpp @@ -1012,7 +1012,6 @@ struct checkScan { }; } // namespace VectorScanReducer -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, team_vector) { ASSERT_TRUE((TestTeamVector::Test(0))); ASSERT_TRUE((TestTeamVector::Test(1))); @@ -1028,9 +1027,7 @@ TEST(TEST_CATEGORY, team_vector) { 
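The new p9/p10/p11 checks in TestTeamPolicyConstructors.hpp above rely on team_size_max() and team_size_recommended(), which report, for a given functor and parallel pattern, the largest valid and the suggested team size. A sketch of typical user-side usage (the functor is hypothetical):

#include <Kokkos_Core.hpp>

struct IllustrativeFunctor {
  KOKKOS_FUNCTION void operator()(
      const Kokkos::TeamPolicy<>::member_type&) const {}
};

void choose_team_size(int league_size) {
  Kokkos::TeamPolicy<> query_policy(league_size, Kokkos::AUTO);
  const int max_size = query_policy.team_size_max(IllustrativeFunctor{},
                                                  Kokkos::ParallelForTag());
  const int rec_size = query_policy.team_size_recommended(
      IllustrativeFunctor{}, Kokkos::ParallelForTag());
  // An explicit team size must not exceed max_size; rec_size is a sane default.
  Kokkos::parallel_for(Kokkos::TeamPolicy<>(league_size, rec_size),
                       IllustrativeFunctor{});
  (void)max_size;
}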
ASSERT_TRUE((TestTeamVector::Test(11))); ASSERT_TRUE((TestTeamVector::Test(12))); } -#endif -#if !defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND) TEST(TEST_CATEGORY, triple_nested_parallelism) { // With KOKKOS_ENABLE_DEBUG enabled, the functor uses too many registers to run // with a team size of 32 on GPUs, 16 is the max possible (at least on a K80 @@ -1055,7 +1052,6 @@ TEST(TEST_CATEGORY, triple_nested_parallelism) { TestTripleNestedReduce(8192, 2048, 16, 16); TestTripleNestedReduce(8192, 2048, 7, 16); } -#endif TEST(TEST_CATEGORY, parallel_scan_with_reducers) { using T = double; diff --git a/core/unit_test/TestUtilities.hpp b/core/unit_test/TestUtilities.hpp index b1f9d30c1fc..ad5a0df92de 100644 --- a/core/unit_test/TestUtilities.hpp +++ b/core/unit_test/TestUtilities.hpp @@ -25,20 +25,18 @@ namespace Test { void test_is_specialization_of() { using Kokkos::Impl::is_specialization_of; - static_assert(is_specialization_of, Kokkos::pair>{}, - ""); - static_assert(!is_specialization_of, Kokkos::pair>{}, ""); - static_assert(is_specialization_of, Kokkos::View>{}, ""); + static_assert(is_specialization_of, Kokkos::pair>{}); + static_assert(!is_specialization_of, Kokkos::pair>{}); + static_assert(is_specialization_of, Kokkos::View>{}); // NOTE Not removing cv-qualifiers - static_assert(!is_specialization_of const, Kokkos::View>{}, - ""); + static_assert( + !is_specialization_of const, Kokkos::View>{}); // NOTE Would not compile because Kokkos::Array takes a non-type template // parameter - // static_assert(is_specialization_of, Kokkos::Array>{}, - // ""); + // static_assert(is_specialization_of, + // Kokkos::Array>{}); // But this is fine of course - static_assert(!is_specialization_of, Kokkos::pair>{}, - ""); + static_assert(!is_specialization_of, Kokkos::pair>{}); } namespace { diff --git a/core/unit_test/TestViewAPI.hpp b/core/unit_test/TestViewAPI.hpp index ffc500e4a9a..ca098dbc247 100644 --- a/core/unit_test/TestViewAPI.hpp +++ b/core/unit_test/TestViewAPI.hpp @@ -958,8 +958,7 @@ class TestViewAPI { using mirror_type = typename view_type::HostMirror; static_assert(std::is_same::value, - ""); + typename mirror_type::memory_space>::value); view_type a("a"); mirror_type am = Kokkos::create_mirror_view(a); @@ -1005,25 +1004,25 @@ class TestViewAPI { hView3 hv_3("dView3::HostMirror", N0); hView4 hv_4("dView4::HostMirror", N0); - dView0 dv_0_1(nullptr, 0); + dView0 dv_0_1(nullptr); dView0 dv_0_2(hv_0.label(), hv_0.layout()); - dView1 dv_1_1(nullptr, 0); + dView1 dv_1_1(nullptr, N0); dView1 dv_1_2(hv_1.label(), hv_1.layout()); - dView2 dv_2_1(nullptr, 0); + dView2 dv_2_1(nullptr, N0); dView2 dv_2_2(hv_2.label(), hv_2.layout()); - dView3 dv_3_1(nullptr, 0); + dView3 dv_3_1(nullptr, N0); dView3 dv_3_2(hv_3.label(), hv_3.layout()); - dView4 dv_4_1(nullptr, 0); + dView4 dv_4_1(nullptr, N0); dView4 dv_4_2(hv_4.label(), hv_4.layout()); } static void run_test_contruction_from_layout_2() { using dView3_0 = Kokkos::View; - using dView3_1 = Kokkos::View; + using dView3_1 = Kokkos::View; using dView3_2 = Kokkos::View; using dView3_3 = Kokkos::View; @@ -1554,6 +1553,7 @@ class TestViewAPI { Kokkos::CudaUVMSpace>::value) return; #endif + bool did_throw = false; auto alloc_size = std::numeric_limits::max() - 42; try { auto should_always_fail = dView1("hello_world_failure", alloc_size); @@ -1585,7 +1585,9 @@ class TestViewAPI { "because of an unknown error.", msg); } #endif + did_throw = true; } + ASSERT_TRUE(did_throw); } }; diff --git a/core/unit_test/TestViewAPI_d.hpp b/core/unit_test/TestViewAPI_d.hpp index 
08d21f54499..b0d759ffccc 100644
--- a/core/unit_test/TestViewAPI_d.hpp
+++ b/core/unit_test/TestViewAPI_d.hpp
@@ -27,8 +27,19 @@ TEST(TEST_CATEGORY, view_api_d) {
 }
 
 TEST(TEST_CATEGORY, view_allocation_error) {
+#if defined(__has_feature)
+#if __has_feature(address_sanitizer)
+  GTEST_SKIP() << "AddressSanitizer detects allocating too much memory "
+                  "preventing our checks from running";
+#endif
+#endif
 #if ((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 3))
   GTEST_SKIP() << "ROCm 5.3 segfaults when trying to allocate too much memory";
+#endif
+#if defined(KOKKOS_ENABLE_OPENACC)  // FIXME_OPENACC
+  if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenACC>) {
+    GTEST_SKIP() << "acc_malloc() not properly returning nullptr";
+  }
 #endif
   TestViewAPI<double, TEST_EXECSPACE>::run_test_error();
 }
diff --git a/core/unit_test/TestViewCopy_a.hpp b/core/unit_test/TestViewCopy_a.hpp
index 3bfc93aadac..a4735b29988 100644
--- a/core/unit_test/TestViewCopy_a.hpp
+++ b/core/unit_test/TestViewCopy_a.hpp
@@ -147,6 +147,40 @@ TEST(TEST_CATEGORY, view_copy_tests) {
       Kokkos::deep_copy(s_a, hs_a);
       ASSERT_TRUE(run_check(s_a, 6));
     }
+  } else {
+    // These copies won't succeed, but they should each throw
+    // an exception whose message contains the view labels,
+    // and the names of the views' memory spaces.
+    //
+    // Note: original a,b both have the same device type,
+    // and their mirrors have the same device type.
+    using memory_space        = typename decltype(a)::memory_space;
+    using mirror_memory_space = typename decltype(h_a)::memory_space;
+    bool threw = false;
+    std::string msg;
+    try {
+      Kokkos::deep_copy(hs_b, s_b);
+    } catch (std::exception& e) {
+      threw = true;
+      msg   = e.what();
+    }
+    ASSERT_TRUE(threw);
+    ASSERT_NE(msg.find(hs_b.label()), std::string::npos);
+    ASSERT_NE(msg.find(s_b.label()), std::string::npos);
+    ASSERT_NE(msg.find(memory_space().name()), std::string::npos);
+    ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos);
+    threw = false;
+    try {
+      Kokkos::deep_copy(s_a, hs_a);
+    } catch (std::exception& e) {
+      threw = true;
+      msg   = e.what();
+    }
+    ASSERT_TRUE(threw);
+    ASSERT_NE(msg.find(s_a.label()), std::string::npos);
+    ASSERT_NE(msg.find(hs_a.label()), std::string::npos);
+    ASSERT_NE(msg.find(memory_space().name()), std::string::npos);
+    ASSERT_NE(msg.find(mirror_memory_space().name()), std::string::npos);
   }
 
   // Contiguous copies
diff --git a/core/unit_test/TestViewCtorDimMatch.hpp b/core/unit_test/TestViewCtorDimMatch.hpp
index d71841eef84..40b7737f2e4 100644
--- a/core/unit_test/TestViewCtorDimMatch.hpp
+++ b/core/unit_test/TestViewCtorDimMatch.hpp
@@ -19,33 +19,72 @@
 
 namespace Test {
 
-#define LIVE(EXPR, ARGS, DYNRANK) EXPECT_NO_THROW(EXPR)
-#define DIE(EXPR, ARGS, DYNRANK) \
-  ASSERT_DEATH( \
-      EXPR, \
-      "Constructor for Kokkos View 'v_" #ARGS \
-      "' has mismatched number of arguments. Number of arguments = " #ARGS \
-      " but dynamic rank = " #DYNRANK)
+template <int rank, int dynrank, class RankType, size_t... Is>
+void test_matching_arguments_rank_helper(std::index_sequence<Is...>) {
+  constexpr int nargs = sizeof...(Is);
+  using view_type     = Kokkos::View<RankType>;
+  if (nargs == rank || nargs == dynrank) {
+    EXPECT_NO_THROW({ view_type v("v", ((Is * 0) + 1)...); });
+    EXPECT_NO_THROW({ view_type v(nullptr, ((Is * 0) + 1)...); });
+  } else {
+    ASSERT_DEATH(
+        { view_type v("v", ((Is * 0) + 1)...); },
+        "Constructor for Kokkos::View 'v' has mismatched number of arguments. "
" + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + ASSERT_DEATH( + { view_type v(nullptr, ((Is * 0) + 1)...); }, + "Constructor for Kokkos::View 'UNMANAGED' has mismatched number of " + "arguments. " + "The number of arguments = " + + std::to_string(nargs) + + " neither matches the dynamic rank = " + std::to_string(dynrank) + + " nor the total rank = " + std::to_string(rank)); + } +} -#define PARAM_0 -#define PARAM_1 1 -#define PARAM_2 1, 1 -#define PARAM_3 1, 1, 1 -#define PARAM_4 1, 1, 1, 1 -#define PARAM_5 1, 1, 1, 1, 1 -#define PARAM_6 1, 1, 1, 1, 1, 1 -#define PARAM_7 1, 1, 1, 1, 1, 1, 1 +template class RankType> +void test_matching_arguments_rank() { + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<0>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<1>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<2>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<3>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<4>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<5>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<6>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<7>()); + test_matching_arguments_rank_helper::type>( + std::make_index_sequence<8>()); +} -#define PARAM_0_RANK 0 -#define PARAM_1_RANK 1 -#define PARAM_2_RANK 2 -#define PARAM_3_RANK 3 -#define PARAM_4_RANK 4 -#define PARAM_5_RANK 5 -#define PARAM_6_RANK 6 -#define PARAM_7_RANK 7 +template +struct DynamicRank { + using type = typename DynamicRank::type*; +}; -using DType = int; +template <> +struct DynamicRank<0> { + using type = int; +}; // Skip test execution when KOKKOS_ENABLE_OPENMPTARGET is enabled until // Kokkos::abort() aborts properly on that backend @@ -53,348 +92,110 @@ using DType = int; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_dyn) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType *; - using DType_2 = DType **; - using DType_3 = DType ***; - using DType_4 = DType ****; - using DType_5 = DType *****; - using DType_6 = DType ******; - using DType_7 = DType *******; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 2, dynamic = 2 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View 
v_2("v_2", PARAM_2); }, 2, 2); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } - - { - // test View parameters for View dim = 3, dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } - - { - // test View parameters for View dim = 4, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } - - { - // test View parameters for View dim = 5, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, DynamicRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 1, DynamicRank>(); // dim = 1, dynamic = 1 + test_matching_arguments_rank<2, 2, DynamicRank>(); // dim = 2, dynamic = 2 + test_matching_arguments_rank<3, 3, DynamicRank>(); // dim = 3, dynamic = 3 + test_matching_arguments_rank<4, 4, DynamicRank>(); // dim = 4, dynamic = 4 + test_matching_arguments_rank<5, 5, DynamicRank>(); // dim = 5, dynamic = 5 + test_matching_arguments_rank<6, 6, DynamicRank>(); // dim = 6, dynamic = 6 + test_matching_arguments_rank<7, 7, DynamicRank>(); // dim = 7, dynamic = 7 + test_matching_arguments_rank<8, 8, DynamicRank>(); // dim = 8, dynamic = 8 +#endif +} - { - // test View parameters for View dim = 6, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } +template +struct StaticRank { + using type = typename StaticRank::type[1]; +}; - { - // test View parameters for View dim = 7, dynamic = 7 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 7); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 7); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 7); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 7); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 7); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 7); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 7); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 7); - } -} +template <> +struct 
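The DynamicRank builder used above generates the `int`, `int*`, `int**`, ... data types that the removed DType_N aliases spelled out by hand. The template parameter lists are stripped in this rendering of the diff; `<int N>` below is the assumed form:

```cpp
#include <type_traits>

template <int N>
struct DynamicRank {
  using type = typename DynamicRank<N - 1>::type*;  // one '*' per rank level
};

template <>
struct DynamicRank<0> {
  using type = int;  // rank-0 base case
};

static_assert(std::is_same_v<DynamicRank<1>::type, int*>);
static_assert(std::is_same_v<DynamicRank<3>::type, int***>);

int main() {}
```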
StaticRank<0> { + using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_stat) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType[1][1]; - using DType_3 = DType[1][1][1]; - using DType_4 = DType[1][1][1][1]; - using DType_5 = DType[1][1][1][1][1]; - using DType_6 = DType[1][1][1][1][1][1]; - using DType_7 = DType[1][1][1][1][1][1][1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 3, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 4, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, StaticRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, StaticRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 0, StaticRank>(); // dim = 2, dynamic = 0 + test_matching_arguments_rank<3, 0, StaticRank>(); // dim = 3, dynamic = 0 + test_matching_arguments_rank<4, 0, StaticRank>(); // dim = 4, dynamic = 0 + test_matching_arguments_rank<5, 0, StaticRank>(); // dim = 5, dynamic = 0 + test_matching_arguments_rank<6, 0, StaticRank>(); // dim = 6, dynamic = 0 + test_matching_arguments_rank<7, 0, StaticRank>(); // dim = 7, dynamic = 0 + test_matching_arguments_rank<8, 0, 
StaticRank>(); // dim = 8, dynamic = 0 +#endif +} - { - // test View parameters for View dim = 5, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } +template +struct MixedRank { + using type = typename DynamicRank::type[1]; +}; - { - // test View parameters for View dim = 6, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 7, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } -} +template <> +struct MixedRank<0> { + using type = int; +}; TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_params_mix) { ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - using DType_0 = DType; - using DType_1 = DType[1]; - using DType_2 = DType * [1]; - using DType_3 = DType * * [1]; - using DType_4 = DType ** * [1]; - using DType_5 = DType *** * [1]; - using DType_6 = DType **** * [1]; - using DType_7 = DType ***** * [1]; - { - // test View parameters for View dim = 0, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 1, dynamic = 0 - LIVE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 0); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 0); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 0); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 0); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 0); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 0); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 0); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 0); - } - - { - // test View parameters for View dim = 2, dynamic = 1 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 1); - LIVE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 1); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 1); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 1); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 1); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 1); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 1); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 1); - } - - { - // test View parameters for View dim = 3, dynamic = 2 - DIE({ Kokkos::View 
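StaticRank and MixedRank complete the picture, generating the all-static `int[1][1]...` types and the mixed `int*...* [1]` types that the _stat and _mix tests used to write out as DType_N aliases. A self-contained sketch, under the same `<int N>` assumption:

```cpp
#include <type_traits>

template <int N>
struct DynamicRank { using type = typename DynamicRank<N - 1>::type*; };
template <>
struct DynamicRank<0> { using type = int; };

// All extents static: one [1] per rank level.
template <int N>
struct StaticRank { using type = typename StaticRank<N - 1>::type[1]; };
template <>
struct StaticRank<0> { using type = int; };

// All but the last extent dynamic: pointers wrapped in a trailing [1].
template <int N>
struct MixedRank { using type = typename DynamicRank<N - 1>::type[1]; };
template <>
struct MixedRank<0> { using type = int; };

// Mirrors the removed hand-written aliases (e.g. the rank-3 DType_3 forms):
static_assert(std::is_same_v<StaticRank<3>::type, int[1][1][1]>);
static_assert(std::is_same_v<MixedRank<3>::type, int** [1]>);
static_assert(std::is_same_v<DynamicRank<3>::type, int***>);

int main() {}
```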
v_0("v_0" PARAM_0); }, 0, 2); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 2); - LIVE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 2); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 2); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 2); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 2); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 2); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 2); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + test_matching_arguments_rank<0, 0, MixedRank>(); // dim = 0, dynamic = 0 + test_matching_arguments_rank<1, 0, MixedRank>(); // dim = 1, dynamic = 0 + test_matching_arguments_rank<2, 1, MixedRank>(); // dim = 2, dynamic = 1 + test_matching_arguments_rank<3, 2, MixedRank>(); // dim = 3, dynamic = 2 + test_matching_arguments_rank<4, 3, MixedRank>(); // dim = 4, dynamic = 3 + test_matching_arguments_rank<5, 4, MixedRank>(); // dim = 5, dynamic = 4 + test_matching_arguments_rank<6, 5, MixedRank>(); // dim = 6, dynamic = 5 + test_matching_arguments_rank<7, 6, MixedRank>(); // dim = 7, dynamic = 6 + test_matching_arguments_rank<8, 7, MixedRank>(); // dim = 8, dynamic = 7 +#endif +} - { - // test View parameters for View dim = 4, dynamic = 3 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 3); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 3); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 3); - LIVE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 3); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 3); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 3); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 3); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 3); - } +#define CHECK_DEATH(EXPR) \ + ASSERT_DEATH(EXPR, \ + "The specified run-time extent for Kokkos::View 'v' does not " \ + "match the compile-time extent in dimension 0. The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 5, dynamic = 4 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 4); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 4); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 4); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 4); - LIVE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 4); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 4); - DIE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 4); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 4); - } +#define CHECK_DEATH_UNMANAGED(EXPR) \ + ASSERT_DEATH( \ + EXPR, \ + "The specified run-time extent for Kokkos::View 'UNMANAGED' does not " \ + "match the compile-time extent in dimension 0. 
The given " \ + "extent is 2 but should be 1.") - { - // test View parameters for View dim = 6, dynamic = 5 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 5); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 5); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 5); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 5); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 5); - LIVE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 5); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 5); - DIE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 5); - } +TEST(TEST_CATEGORY_DEATH, view_construction_with_wrong_static_extents) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - { - // test View parameters for View dim = 7, dynamic = 6 - DIE({ Kokkos::View v_0("v_0" PARAM_0); }, 0, 6); - DIE({ Kokkos::View v_1("v_1", PARAM_1); }, 1, 6); - DIE({ Kokkos::View v_2("v_2", PARAM_2); }, 2, 6); - DIE({ Kokkos::View v_3("v_3", PARAM_3); }, 3, 6); - DIE({ Kokkos::View v_4("v_4", PARAM_4); }, 4, 6); - DIE({ Kokkos::View v_5("v_5", PARAM_5); }, 5, 6); - LIVE({ Kokkos::View v_6("v_6", PARAM_6); }, 6, 6); - LIVE({ Kokkos::View v_7("v_7", PARAM_7); }, 7, 6); - } +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECKS + // clang-format off + CHECK_DEATH({ Kokkos::View v("v", 2); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH({ Kokkos::View v("v", 2, 1, 1, 1, 1, 1, 1, 1); }); + + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1); }); + CHECK_DEATH_UNMANAGED({ Kokkos::View v(nullptr, 2, 1, 1, 1, 1, 1, 1, 1); }); + // clang-format on +#endif } -#endif // KOKKOS_ENABLE_OPENMPTARGET - -#undef PARAM_0 -#undef PARAM_1 -#undef PARAM_2 -#undef PARAM_3 -#undef PARAM_4 -#undef PARAM_5 -#undef PARAM_6 -#undef PARAM_7 -#undef PARAM_0_RANK -#undef PARAM_1_RANK -#undef PARAM_2_RANK -#undef PARAM_3_RANK -#undef PARAM_4_RANK -#undef PARAM_5_RANK -#undef PARAM_6_RANK -#undef PARAM_7_RANK - -#undef DType - -#undef LIVE -#undef DIE +#undef CHECK_DEATH +#endif // KOKKOS_ENABLE_OPENMPTARGET } // namespace Test diff --git a/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp b/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp new file mode 100644 index 00000000000..b156b72860e --- /dev/null +++ b/core/unit_test/TestViewEmptyRuntimeUnmanaged.hpp @@ -0,0 +1,55 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
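The CHECK_DEATH and CHECK_DEATH_UNMANAGED assertions above pin down the other half of the contract: a run-time extent may be passed for a compile-time dimension, but it must equal the static extent, and a mismatch aborts when debug bounds checking is enabled. A sketch of both sides (the abort path is left commented out):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Run-time extent 1 matches the static extent 1: fine.
    Kokkos::View<int[1]> ok("v", 1);
    // Run-time extent 2 contradicts the static extent 1: aborts with
    // "The specified run-time extent ... does not match the compile-time
    // extent" when KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK is defined.
    // Kokkos::View<int[1]> bad("v", 2);
  }
  Kokkos::finalize();
}
```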
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +template +void test_empty_view_runtime_unmanaged() { + T d{}; + auto* p = reinterpret_cast(0xABADBABE); + + (void)Kokkos::View(p); + (void)Kokkos::View(&d); + (void)Kokkos::View(nullptr); + (void)Kokkos::View(NULL); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0); + (void)Kokkos::View(&d, 0); + (void)Kokkos::View(nullptr, 0); + (void)Kokkos::View(NULL, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0); // NOLINT(modernize-use-nullptr) + + (void)Kokkos::View(p, 0, 0); + (void)Kokkos::View(&d, 0, 0); + (void)Kokkos::View(nullptr, 0, 0); + (void)Kokkos::View(NULL, 0, 0); // NOLINT(modernize-use-nullptr) + (void)Kokkos::View(0, 0, 0); // NOLINT(modernize-use-nullptr) +} + +TEST(TEST_CATEGORY, view_empty_runtime_unmanaged) { + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); + test_empty_view_runtime_unmanaged(); +} + +} // namespace diff --git a/core/unit_test/TestViewMapping_a.hpp b/core/unit_test/TestViewMapping_a.hpp index 9173f0d4316..a4dfdb26e3f 100644 --- a/core/unit_test/TestViewMapping_a.hpp +++ b/core/unit_test/TestViewMapping_a.hpp @@ -73,67 +73,67 @@ void test_view_mapping() { ASSERT_LE(sizeof(dim_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); ASSERT_EQ(sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0), 8 * sizeof(unsigned)); #endif - static_assert(int(dim_0::rank) == int(0), ""); - static_assert(int(dim_0::rank_dynamic) == int(0), ""); - static_assert(int(dim_0::ArgN0) == 1, ""); - static_assert(int(dim_0::ArgN1) == 1, ""); - static_assert(int(dim_0::ArgN2) == 1, ""); - - static_assert(int(dim_s2::rank) == int(1), ""); - static_assert(int(dim_s2::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2::ArgN0) == 2, ""); - static_assert(int(dim_s2::ArgN1) == 1, ""); - - static_assert(int(dim_s2_s3::rank) == int(2), ""); - static_assert(int(dim_s2_s3::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3::ArgN2) == 1, ""); - - static_assert(int(dim_s2_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0), ""); - static_assert(int(dim_s2_s3_s4::ArgN0) == 2, ""); - static_assert(int(dim_s2_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s2_s3_s4::ArgN2) == 4, ""); - static_assert(int(dim_s2_s3_s4::ArgN3) == 1, ""); - - static_assert(int(dim_s0::rank) == int(1), ""); - static_assert(int(dim_s0::rank_dynamic) == int(1), ""); - - static_assert(int(dim_s0_s3::rank) == int(2), ""); - static_assert(int(dim_s0_s3::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3::ArgN1) == 3, ""); - - static_assert(int(dim_s0_s3_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1), ""); - static_assert(int(dim_s0_s3_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s3_s4::ArgN1) == 3, ""); - static_assert(int(dim_s0_s3_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s4::rank) == int(3), ""); - static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2), ""); - static_assert(int(dim_s0_s0_s4::ArgN0) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN1) == 0, ""); - static_assert(int(dim_s0_s0_s4::ArgN2) == 4, ""); - - static_assert(int(dim_s0_s0_s0::rank) == int(3), ""); - 
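The TestViewEmptyRuntimeUnmanaged.hpp cases above all rely on the same guarantee: a zero-extent unmanaged View built from a null (or otherwise unusable) pointer allocates nothing and is safe to construct and destroy. A companion usage sketch; the is_allocated() check is an assumption about the tested behavior, not part of the diff:

```cpp
#include <Kokkos_Core.hpp>
#include <cassert>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Unmanaged, zero extent: no allocation, no deallocation on destruction.
    Kokkos::View<int*, Kokkos::HostSpace> v(nullptr, 0);
    assert(v.size() == 0);
    assert(v.data() == nullptr);
    assert(!v.is_allocated());
  }
  Kokkos::finalize();
}
```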
static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3), ""); - - static_assert(int(dim_s0_s0_s0_s0::rank) == int(4), ""); - static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5), ""); - static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7), ""); - - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8), ""); - static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8), ""); + static_assert(int(dim_0::rank) == int(0)); + static_assert(int(dim_0::rank_dynamic) == int(0)); + static_assert(int(dim_0::ArgN0) == 1); + static_assert(int(dim_0::ArgN1) == 1); + static_assert(int(dim_0::ArgN2) == 1); + + static_assert(int(dim_s2::rank) == int(1)); + static_assert(int(dim_s2::rank_dynamic) == int(0)); + static_assert(int(dim_s2::ArgN0) == 2); + static_assert(int(dim_s2::ArgN1) == 1); + + static_assert(int(dim_s2_s3::rank) == int(2)); + static_assert(int(dim_s2_s3::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3::ArgN0) == 2); + static_assert(int(dim_s2_s3::ArgN1) == 3); + static_assert(int(dim_s2_s3::ArgN2) == 1); + + static_assert(int(dim_s2_s3_s4::rank) == int(3)); + static_assert(int(dim_s2_s3_s4::rank_dynamic) == int(0)); + static_assert(int(dim_s2_s3_s4::ArgN0) == 2); + static_assert(int(dim_s2_s3_s4::ArgN1) == 3); + static_assert(int(dim_s2_s3_s4::ArgN2) == 4); + static_assert(int(dim_s2_s3_s4::ArgN3) == 1); + + static_assert(int(dim_s0::rank) == int(1)); + static_assert(int(dim_s0::rank_dynamic) == int(1)); + + static_assert(int(dim_s0_s3::rank) == int(2)); + static_assert(int(dim_s0_s3::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3::ArgN0) == 0); + static_assert(int(dim_s0_s3::ArgN1) == 3); + + static_assert(int(dim_s0_s3_s4::rank) == int(3)); + static_assert(int(dim_s0_s3_s4::rank_dynamic) == int(1)); + static_assert(int(dim_s0_s3_s4::ArgN0) == 0); + static_assert(int(dim_s0_s3_s4::ArgN1) == 3); + static_assert(int(dim_s0_s3_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s4::rank) == int(3)); + static_assert(int(dim_s0_s0_s4::rank_dynamic) == int(2)); + static_assert(int(dim_s0_s0_s4::ArgN0) == 0); + static_assert(int(dim_s0_s0_s4::ArgN1) == 0); + static_assert(int(dim_s0_s0_s4::ArgN2) == 4); + + static_assert(int(dim_s0_s0_s0::rank) == int(3)); + static_assert(int(dim_s0_s0_s0::rank_dynamic) == int(3)); + + static_assert(int(dim_s0_s0_s0_s0::rank) == int(4)); + static_assert(int(dim_s0_s0_s0_s0::rank_dynamic) == int(4)); + + static_assert(int(dim_s0_s0_s0_s0_s0::rank) == int(5)); + static_assert(int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7)); + + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8)); + static_assert(int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8)); dim_s0 d1(2, 3, 4, 5, 6, 7, 8, 9); dim_s0_s0 d2(2, 3, 4, 5, 6, 7, 8, 9); @@ -514,11 +514,11 @@ void test_view_mapping() { { using namespace Kokkos::Impl; - static_assert(rank_dynamic<>::value == 0, ""); - static_assert(rank_dynamic<1>::value == 
0, ""); - static_assert(rank_dynamic<0>::value == 1, ""); - static_assert(rank_dynamic<0, 1>::value == 1, ""); - static_assert(rank_dynamic<0, 0, 1>::value == 2, ""); + static_assert(rank_dynamic<>::value == 0); + static_assert(rank_dynamic<1>::value == 0); + static_assert(rank_dynamic<0>::value == 1); + static_assert(rank_dynamic<0, 1>::value == 1); + static_assert(rank_dynamic<0, 0, 1>::value == 2); } { @@ -529,54 +529,48 @@ void test_view_mapping() { using a_const_int_r1 = ViewArrayAnalysis; using a_const_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r1::dimension::rank == 1, ""); - static_assert(a_int_r1::dimension::rank_dynamic == 1, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_int_r1::dimension::rank == 1); + static_assert(a_int_r1::dimension::rank_dynamic == 1); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 4); + static_assert(a_int_r5::dimension::ArgN3 == 5); + static_assert(a_int_r5::dimension::ArgN4 == 6); + static_assert(a_int_r5::dimension::ArgN5 == 1); static_assert( - std::is_same >::value, - ""); + std::is_same >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); - static_assert(a_const_int_r1::dimension::rank == 1, ""); - static_assert(a_const_int_r1::dimension::rank_dynamic == 1, ""); + static_assert(a_const_int_r1::dimension::rank == 1); + static_assert(a_const_int_r1::dimension::rank_dynamic == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0> >::value); + static_assert(std::is_same::value); - static_assert(a_const_int_r5::dimension::rank == 5, ""); - static_assert(a_const_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_const_int_r5::dimension::rank == 5); + static_assert(a_const_int_r5::dimension::rank_dynamic == 2); - static_assert(a_const_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_const_int_r5::dimension::ArgN2 == 4, ""); - static_assert(a_const_int_r5::dimension::ArgN3 == 5, ""); - static_assert(a_const_int_r5::dimension::ArgN4 == 6, ""); - static_assert(a_const_int_r5::dimension::ArgN5 == 1, ""); + static_assert(a_const_int_r5::dimension::ArgN0 == 0); + static_assert(a_const_int_r5::dimension::ArgN1 == 0); + static_assert(a_const_int_r5::dimension::ArgN2 == 4); + static_assert(a_const_int_r5::dimension::ArgN3 == 5); + static_assert(a_const_int_r5::dimension::ArgN4 == 6); + static_assert(a_const_int_r5::dimension::ArgN5 == 1); static_assert(std::is_same >::value, - ""); - static_assert( - std::is_same::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); + static_assert(std::is_same::value); - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 2, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 2); static_assert(std::is_same >::value, - ""); + ViewDimension<0, 0, 4, 5, 6> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -587,15 +581,15 @@ void test_view_mapping() { // Dimensions of t_i4 are appended to the multdimensional array. 
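The rank_dynamic trait exercised above counts the leading dynamic extents, where an argument of 0 marks a dynamic dimension. A simplified stand-in that satisfies the same assertions:

```cpp
#include <cstddef>

template <std::size_t... Args>
struct rank_dynamic;

template <>
struct rank_dynamic<> {
  static constexpr unsigned value = 0;
};

// A leading 0 is a dynamic extent; the first nonzero extent stops the count.
template <std::size_t N0, std::size_t... Ns>
struct rank_dynamic<N0, Ns...> {
  static constexpr unsigned value =
      N0 == 0 ? 1u + rank_dynamic<Ns...>::value : 0u;
};

static_assert(rank_dynamic<>::value == 0);
static_assert(rank_dynamic<1>::value == 0);
static_assert(rank_dynamic<0>::value == 1);
static_assert(rank_dynamic<0, 1>::value == 1);
static_assert(rank_dynamic<0, 0, 1>::value == 2);

int main() {}
```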
using a_int_r5 = ViewArrayAnalysis; - static_assert(a_int_r5::dimension::rank == 5, ""); - static_assert(a_int_r5::dimension::rank_dynamic == 3, ""); - static_assert(a_int_r5::dimension::ArgN0 == 0, ""); - static_assert(a_int_r5::dimension::ArgN1 == 0, ""); - static_assert(a_int_r5::dimension::ArgN2 == 0, ""); - static_assert(a_int_r5::dimension::ArgN3 == 3, ""); - static_assert(a_int_r5::dimension::ArgN4 == 4, ""); + static_assert(a_int_r5::dimension::rank == 5); + static_assert(a_int_r5::dimension::rank_dynamic == 3); + static_assert(a_int_r5::dimension::ArgN0 == 0); + static_assert(a_int_r5::dimension::ArgN1 == 0); + static_assert(a_int_r5::dimension::ArgN2 == 0); + static_assert(a_int_r5::dimension::ArgN3 == 3); + static_assert(a_int_r5::dimension::ArgN4 == 4); static_assert( - std::is_same::value, ""); + std::is_same::value); } { @@ -603,71 +597,54 @@ void test_view_mapping() { using a_const_int_r1 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0> >::value); static_assert( - std::is_same::value, ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int*>::value); static_assert( - std::is_same::value, ""); - static_assert( - std::is_same::value, - ""); + std::is_same::value); + static_assert(std::is_same::value); using a_const_int_r3 = ViewDataAnalysis; - static_assert(std::is_void::value, ""); + static_assert(std::is_void::value); static_assert(std::is_same >::value, - ""); + Kokkos::Impl::ViewDimension<0, 0, 4> >::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert( - std::is_same::value, - ""); + std::is_same::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); + const int>::value); static_assert(std::is_same::value, - ""); + const int* * [4]>::value); static_assert(std::is_same::value, - ""); - static_assert( - std::is_same::value, - ""); + int* * [4]>::value); + static_assert(std::is_same::value); static_assert( std::is_same::value, - ""); + int* * [4]>::value); // std::cout << "typeid( const int**[4] ).name() = " << typeid( const // int**[4] ).name() << std::endl; diff --git a/core/unit_test/TestViewMapping_b.hpp b/core/unit_test/TestViewMapping_b.hpp index 9ac4e7da845..4aee035d17a 100644 --- a/core/unit_test/TestViewMapping_b.hpp +++ b/core/unit_test/TestViewMapping_b.hpp @@ -156,7 +156,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -167,7 +167,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using dst_traits = Kokkos::ViewTraits; using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -180,7 +180,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = 
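The ViewArrayAnalysis and ViewDataAnalysis assertions above all follow from one decomposition of the declared data type: each leading `*` contributes a dynamic extent and each trailing `[N]` a static one. A toy trait (hypothetical, far simpler than the real one) that reproduces the asserted numbers:

```cpp
#include <cstddef>
#include <type_traits>

template <class T>
struct analyze {
  static constexpr unsigned rank = 0;
  static constexpr unsigned rank_dynamic = 0;
  using value_type = T;
};

template <class T>
struct analyze<T*> {  // a '*' adds one dynamic dimension
  static constexpr unsigned rank = analyze<T>::rank + 1;
  static constexpr unsigned rank_dynamic = analyze<T>::rank_dynamic + 1;
  using value_type = typename analyze<T>::value_type;
};

template <class T, std::size_t N>
struct analyze<T[N]> {  // a '[N]' adds one static dimension
  static constexpr unsigned rank = analyze<T>::rank + 1;
  static constexpr unsigned rank_dynamic = analyze<T>::rank_dynamic;
  using value_type = typename analyze<T>::value_type;
};

// Matches the ViewDimension<0, 0, 4> decomposition asserted above:
static_assert(analyze<int* * [4]>::rank == 3);
static_assert(analyze<int* * [4]>::rank_dynamic == 2);
static_assert(std::is_same_v<analyze<int* * [4]>::value_type, int>);

int main() {}
```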
Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -193,7 +193,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(mapping::is_assignable, ""); + static_assert(mapping::is_assignable); Kokkos::View src; Kokkos::View dst(src); @@ -206,7 +206,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } { // Assignment of rank-2 Right = Left @@ -215,7 +215,7 @@ TEST(TEST_CATEGORY, view_mapping_assignable) { using src_traits = Kokkos::ViewTraits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(!mapping::is_assignable, ""); + static_assert(!mapping::is_assignable); } } @@ -226,7 +226,7 @@ TEST(TEST_CATEGORY, view_mapping_trivially_copyable) { using src_traits = dst_traits; using mapping = Kokkos::Impl::ViewMapping; - static_assert(std::is_trivially_copyable{}, ""); + static_assert(std::is_trivially_copyable{}); } } // namespace Test diff --git a/core/unit_test/TestViewOutOfBoundsAccess.hpp b/core/unit_test/TestViewOutOfBoundsAccess.hpp new file mode 100644 index 00000000000..2716856c1fc --- /dev/null +++ b/core/unit_test/TestViewOutOfBoundsAccess.hpp @@ -0,0 +1,175 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +#include + +namespace { + +TEST(TEST_CATEGORY, append_formatted_multidimensional_index) { + using Kokkos::Impl::append_formatted_multidimensional_index; + { + char buffer[64] = "my prefix "; + append_formatted_multidimensional_index(buffer, 1); + EXPECT_STREQ(buffer, "my prefix [1]"); + } + { + char buffer[64] = "I was here"; + append_formatted_multidimensional_index(buffer, 1, 2, 3); + EXPECT_STREQ(buffer, "I was here[1,2,3]"); + } + { + char buffer[64] = "with mixed integer types "; + append_formatted_multidimensional_index(buffer, 1u, -2); + EXPECT_STREQ(buffer, "with mixed integer types [1,-2]"); + } +} + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + +template +struct TestViewOutOfBoundAccess { + View v; + static constexpr auto rank = View::rank; + + template + KOKKOS_FUNCTION decltype(auto) bad_access(std::index_sequence) const { + return v((Is * 1 + Is == 0 ? v.extent(Is) + 3 : 0)...); + } + + KOKKOS_FUNCTION void operator()(int) const { + ++bad_access(std::make_index_sequence{}); + } + + template + std::string get_details(std::index_sequence) { + std::stringstream ss; + ss << "with indices \\["; + ((ss << (Is == 0 ? v.extent(Is) + 3 : 0) + << (Is == View::rank() - 1 ? "\\]" : ",")), + ...); + ss << " but extents \\["; + ((ss << v.extent(Is) << (Is == View::rank() - 1 ? 
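For reference, a standalone approximation (assumed behavior, not the real Impl helper) of the formatting that append_formatted_multidimensional_index is tested for above:

```cpp
#include <cstddef>
#include <cstdio>
#include <cstring>

// Appends "[i0,i1,...]" to a prefix already in the buffer.
template <class... Indices>
void append_index(char* buffer, std::size_t size, Indices... idx) {
  bool first = true;
  std::snprintf(buffer + std::strlen(buffer), size - std::strlen(buffer), "[");
  // Fold over the pack, comma-separating the indices.
  ((std::snprintf(buffer + std::strlen(buffer), size - std::strlen(buffer),
                  "%s%lld", first ? "" : ",", static_cast<long long>(idx)),
    first = false),
   ...);
  std::snprintf(buffer + std::strlen(buffer), size - std::strlen(buffer), "]");
}

int main() {
  char buffer[64] = "my prefix ";
  append_index(buffer, sizeof(buffer), 1, 2, 3);
  std::puts(buffer);  // my prefix [1,2,3]
}
```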
"\\]" : ",")), ...); + return ss.str(); + } + + auto get_details() { + return get_details(std::make_index_sequence()); + } + + TestViewOutOfBoundAccess(View w, ExecutionSpace const& s, std::string matcher) + : v(std::move(w)) { + constexpr bool view_accessible_from_execution_space = + Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/typename View::memory_space>::accessible; + EXPECT_TRUE(view_accessible_from_execution_space); + + matcher += ".*" + get_details(); + + EXPECT_DEATH( + { + Kokkos::parallel_for(Kokkos::RangePolicy(s, 0, 1), + *this); + Kokkos::fence(); + }, + matcher); + } +}; + +template +auto make_view_impl(LblOrPtr x, std::index_sequence) { + return View(x, (Is + 1)...); +} + +template +auto make_view(LblOrPtr x) { + return make_view_impl(std::move(x), + std::make_index_sequence()); +} + +template +void test_view_out_of_bounds_access() { + ExecutionSpace const exec_space{}; + // clang-format off + using V1 = Kokkos::View; + using V2 = Kokkos::View; + using V3 = Kokkos::View; + using V4 = Kokkos::View; + using V5 = Kokkos::View; + using V6 = Kokkos::View; + using V7 = Kokkos::View; + using V8 = Kokkos::View; + std::string const prefix = "Kokkos::View ERROR: out of bounds access"; + std::string const lbl = "my_label"; + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + TestViewOutOfBoundAccess(make_view(lbl), exec_space, prefix + ".*" + lbl); + int* const ptr = nullptr; + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + TestViewOutOfBoundAccess(make_view(ptr), exec_space, prefix + ".*UNMANAGED"); + // clang-format on +} + +TEST(TEST_CATEGORY_DEATH, view_out_of_bounds_access) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + + using ExecutionSpace = TEST_EXECSPACE; + + if (false && Kokkos::SpaceAccessibility< + /*AccessSpace=*/ExecutionSpace, + /*MemorySpace=*/Kokkos::HostSpace>::accessible) { + GTEST_SKIP() << "skipping since no memory access violation would occur"; + } + +#if defined(KOKKOS_ENABLE_SYCL) && defined(NDEBUG) // FIXME_SYCL + if (std::is_same_v) { + GTEST_SKIP() << "skipping SYCL device-side abort does not work when NDEBUG " + "is defined"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) // FIXME_OPENMPTARGET + if (std::is_same_v) { + GTEST_SKIP() << "skipping because OpenMPTarget backend is currently not " + "able to abort from the device"; + } +#endif +#if defined(KOKKOS_ENABLE_OPENACC) // FIXME_OPENACC + if (std::is_same::value) { + GTEST_SKIP() << "skipping because OpenACC backend is currently not " + "able to abort from the 
device"; } #endif + test_view_out_of_bounds_access<ExecutionSpace>(); } + +#endif + +} // namespace diff --git a/core/unit_test/UnitTest_DeviceAndThreads.cpp b/core/unit_test/UnitTest_DeviceAndThreads.cpp index b522ac3e69b..25442146fba 100644 --- a/core/unit_test/UnitTest_DeviceAndThreads.cpp +++ b/core/unit_test/UnitTest_DeviceAndThreads.cpp @@ -19,22 +19,23 @@ #include #include -int get_device_count() { +int get_num_devices() { + int num_devices; #if defined(KOKKOS_ENABLE_CUDA) - int count; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); - return count; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_HIP) - int count; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&count)); - return count; + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&num_devices)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_num_devices(); + num_devices = omp_get_num_devices(); #elif defined(KOKKOS_ENABLE_OPENACC) - return acc_get_num_devices(acc_get_device_type()); + num_devices = acc_get_num_devices(acc_get_device_type()); +#elif defined(KOKKOS_ENABLE_SYCL) + num_devices = sycl::device::get_devices(sycl::info::device_type::gpu).size(); #else - return 0; + num_devices = -1; #endif + assert(num_devices == Kokkos::num_devices()); + return num_devices; } int get_device_id() { @@ -44,15 +45,17 @@ int get_device_id() { #elif defined(KOKKOS_ENABLE_HIP) KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDevice(&device_id)); #elif defined(KOKKOS_ENABLE_OPENMPTARGET) - device_id = omp_get_device_num(); + device_id = omp_get_default_device(); #elif defined(KOKKOS_ENABLE_OPENACC) - device_id = acc_get_device_num(acc_get_device_type()); + device_id = acc_get_device_num(acc_get_device_type()); #elif defined(KOKKOS_ENABLE_SYCL) - // FIXME_SYCL ? - assert(false); - return -2; + // Unable to query the underlying runtime because SYCL has no notion of a + // device currently being used. We go through the Kokkos runtime, which + // makes the assert below pointless, but it still lets us check that + // Kokkos selected the device we asked for from the Python tests. + device_id = Kokkos::device_id(); #else - device_id = -1; + device_id = -1; #endif assert(device_id == Kokkos::device_id()); return device_id; } @@ -68,6 +71,14 @@ int get_max_threads() { #endif } +int get_hwloc_enabled() { +#ifdef KOKKOS_ENABLE_HWLOC + return 1; +#else + return 0; +#endif +} + int get_num_threads() { int const num_threads = Kokkos::DefaultHostExecutionSpace().concurrency(); assert(num_threads == Kokkos::num_threads()); @@ -90,9 +101,10 @@ int print_flag(std::string const& flag) { KOKKOS_TEST_PRINT_FLAG(num_threads); KOKKOS_TEST_PRINT_FLAG(max_threads); KOKKOS_TEST_PRINT_FLAG(device_id); - KOKKOS_TEST_PRINT_FLAG(device_count); + KOKKOS_TEST_PRINT_FLAG(num_devices); KOKKOS_TEST_PRINT_FLAG(disable_warnings); KOKKOS_TEST_PRINT_FLAG(tune_internals); + KOKKOS_TEST_PRINT_FLAG(hwloc_enabled); #undef KOKKOS_TEST_PRINT_FLAG diff --git a/core/unit_test/configuration/test-code/test_config_arch_list.bash b/core/unit_test/configuration/test-code/test_config_arch_list.bash index 8fe8e2b5ece..8bc8ef21cd0 100755 --- a/core/unit_test/configuration/test-code/test_config_arch_list.bash +++ b/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL) DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) if [ ! 
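The reworked get_num_devices/get_device_id helpers funnel every backend through one invariant: whatever the native runtime reports must agree with what Kokkos itself reports. Host-only builds report -1 devices, matching the fallback branch above. The same invariant can be probed directly:

```cpp
#include <Kokkos_Core.hpp>
#include <iostream>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // -1 when no device backend is enabled, per the fallback above.
    std::cout << "num_devices: " << Kokkos::num_devices() << '\n';
    std::cout << "device_id:   " << Kokkos::device_id() << '\n';
  }
  Kokkos::finalize();
}
```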
-z "$KOKKOS_HOST_ARCH_TEST" ]; then export KOKKOS_ARCH_TEST=1 - HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) + HostArch=(SNB HSW SKX AMDAVX ARMv80 ARMv81 BDW KNC KNL Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) DeviceArch=() fi diff --git a/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp new file mode 100644 index 00000000000..d94735ceb23 --- /dev/null +++ b/core/unit_test/cuda/TestCuda_InterOp_StreamsMultiGPU.cpp @@ -0,0 +1,268 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +struct StreamsAndDevices { + std::array streams; + std::array devices; + + StreamsAndDevices() { + int n_devices; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&n_devices)); + + devices = {0, n_devices - 1}; + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&streams[i])); + } + } + StreamsAndDevices(const StreamsAndDevices &) = delete; + StreamsAndDevices &operator=(const StreamsAndDevices &) = delete; + ~StreamsAndDevices() { + for (int i = 0; i < 2; ++i) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(devices[i])); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(streams[i])); + } + } +}; + +std::array get_execution_spaces( + const StreamsAndDevices &streams_and_devices) { + TEST_EXECSPACE exec0(streams_and_devices.streams[0]); + TEST_EXECSPACE exec1(streams_and_devices.streams[1]); + + // Must return void to use ASSERT_EQ + [&]() { + ASSERT_EQ(exec0.cuda_device(), streams_and_devices.devices[0]); + ASSERT_EQ(exec1.cuda_device(), streams_and_devices.devices[1]); + }(); + + return {exec0, exec1}; +} + +// Test Interoperability with Cuda Streams +void test_policies(TEST_EXECSPACE exec0, Kokkos::View v0, + TEST_EXECSPACE exec, Kokkos::View v) { + using MemorySpace = typename TEST_EXECSPACE::memory_space; + + Kokkos::deep_copy(exec, v, 5); + Kokkos::deep_copy(exec0, v0, 5); + + Kokkos::deep_copy(v, v0); + + int sum; + int sum0; + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range_0", + Kokkos::RangePolicy(exec0, 0, 100), + Test::FunctorRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Range", + Kokkos::RangePolicy(exec, 0, 100), + Test::FunctorRange(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce_0", + Kokkos::RangePolicy>(exec0, + 0, 100), + Test::FunctorRangeReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::RangeReduce", + Kokkos::RangePolicy>(exec, 0, + 100), + Test::FunctorRangeReduce(v), sum); + ASSERT_EQ(600, sum0); + ASSERT_EQ(600, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange_0", + Kokkos::MDRangePolicy>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRange(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::MDRange", + Kokkos::MDRangePolicy>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRange(v)); + 
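get_execution_spaces above shows the core interop contract: a TEST_EXECSPACE (Kokkos::Cuda) instance wraps a user-provided cudaStream_t without taking ownership, so the caller keeps the stream alive and destroys it afterwards. Reduced to its essentials (a sketch assuming a single stream on a single device):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    cudaStream_t stream;
    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream));
    {
      Kokkos::Cuda exec(stream);  // non-owning wrapper around the stream
      Kokkos::View<int*, Kokkos::CudaSpace> v(
          Kokkos::view_alloc("v", exec), 100);
      Kokkos::parallel_for(
          "fill", Kokkos::RangePolicy<Kokkos::Cuda>(exec, 0, 100),
          KOKKOS_LAMBDA(int i) { v(i) = i; });
      exec.fence();  // waits on this stream only, not the whole device
    }
    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(stream));
  }
  Kokkos::finalize();
}
```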
Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce_0", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec0, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v0), sum0); + Kokkos::parallel_reduce("Test::cuda::raw_cuda_stream::MDRangeReduce", + Kokkos::MDRangePolicy, + Kokkos::LaunchBounds<128, 2>>( + exec, {0, 0}, {10, 10}), + Test::FunctorMDRangeReduce(v), sum); + ASSERT_EQ(700, sum0); + ASSERT_EQ(700, sum); + + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy(exec0, 10, 10), + Test::FunctorTeam(v0)); + Kokkos::parallel_for("Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy(exec, 10, 10), + Test::FunctorTeam(v)); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team_0", + Kokkos::TeamPolicy>(exec0, + 10, 10), + Test::FunctorTeamReduce(v0), sum0); + Kokkos::parallel_reduce( + "Test::cuda::raw_cuda_stream::Team", + Kokkos::TeamPolicy>(exec, 10, + 10), + Test::FunctorTeamReduce(v), sum); + ASSERT_EQ(800, sum0); + ASSERT_EQ(800, sum); +} + +TEST(cuda_multi_gpu, managed_views) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + Kokkos::View view0( + Kokkos::view_alloc("v0", execs[0]), 100); + Kokkos::View view(Kokkos::view_alloc("v", execs[1]), + 100); + + test_policies(execs[0], view0, execs[1], view); + } +} + +TEST(cuda_multi_gpu, unmanaged_views) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[0].cuda_device())); + int *p0; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p0), sizeof(int) * 100)); + Kokkos::View view0(p0, 100); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(execs[1].cuda_device())); + int *p; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(reinterpret_cast(&p), sizeof(int) * 100)); + Kokkos::View view(p, 100); + + test_policies(execs[0], view0, execs[1], view); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(p)); + } +} + +struct ScratchFunctor { + int scratch_size; + int R; + + ScratchFunctor(int scratch_size_, int R_) + : scratch_size(scratch_size_), R(R_) {} + + KOKKOS_FUNCTION + void operator()(const Kokkos::TeamPolicy::member_type &team, + int &error_accum) const { + Kokkos::View scratch_mem( + team.team_scratch(1), scratch_size); + + // Initialize scratch memory + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) = 0; }); + team.team_barrier(); + + // Increment each entry in scratch memory R times + for (int r = 0; r < R; ++r) { + Kokkos::parallel_for(Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i) { scratch_mem(i) += 1; }); + } + team.team_barrier(); + + // Check that each scratch entry has been incremented exactly R times + int team_error_accum; + auto R_loc = R; // avoid implicit capture of this + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, 0, scratch_size), + [&](int i, int &tsum) { + if (scratch_mem(i) != R_loc) { + tsum += 1; + } + }, + team_error_accum); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { error_accum += team_error_accum; }); + } +}; + +void test_scratch(TEST_EXECSPACE exec0, TEST_EXECSPACE exec1) { + constexpr int N = 10; + constexpr int R = 1000; + constexpr int scratch_size = 100; + using ScratchType = Kokkos::View; + + // Test allocating and using scratch space + ScratchFunctor f(scratch_size, R); + + auto policy0 = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, 
Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + auto policy1 = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(scratch_size))); + + int error0, error1; + + Kokkos::parallel_reduce("test_scratch_device_0", policy0, f, error0); + Kokkos::parallel_reduce("test_scratch_device_1", policy1, f, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); + + // Request larger scratch size to trigger a realloc and test + const auto new_scratch_size = scratch_size + 10; + ScratchFunctor f_more_scratch(new_scratch_size, R); + + auto policy0_more_scratch = + Kokkos::TeamPolicy(exec0, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + auto policy1_more_scratch = + Kokkos::TeamPolicy(exec1, N, 10) + .set_scratch_size( + 1, Kokkos::PerTeam(ScratchType::shmem_size(new_scratch_size))); + + Kokkos::parallel_reduce("test_realloc_scratch_device_0", policy0_more_scratch, + f_more_scratch, error0); + Kokkos::parallel_reduce("test_realloc_scratch_device_1", policy1_more_scratch, + f_more_scratch, error1); + ASSERT_EQ(error0, 0); + ASSERT_EQ(error1, 0); +} + +TEST(cuda_multi_gpu, scratch_space) { + StreamsAndDevices streams_and_devices; + { + std::array execs = + get_execution_spaces(streams_and_devices); + + test_scratch(execs[0], execs[1]); + } +} +} // namespace diff --git a/core/unit_test/cuda/TestCuda_Spaces.cpp b/core/unit_test/cuda/TestCuda_Spaces.cpp index ae603101abb..11fe6b8555b 100644 --- a/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -29,200 +29,166 @@ __global__ void test_cuda_spaces_int_value(int *ptr) { TEST(cuda, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert( - !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + 
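The scratch tests above exercise the standard two-step pattern: reserve per-team level-1 scratch on the policy, then bind an unmanaged scratch-space View inside the kernel. Minimal form (Kokkos::AUTO team size is an arbitrary choice here):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using policy_type  = Kokkos::TeamPolicy<Kokkos::Cuda>;
    using member_type  = policy_type::member_type;
    using scratch_view =
        Kokkos::View<int*, Kokkos::Cuda::scratch_memory_space,
                     Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
    constexpr int scratch_len = 100;

    // Step 1: tell the policy how much level-1 scratch each team needs.
    auto policy = policy_type(Kokkos::Cuda{}, 10, Kokkos::AUTO)
                      .set_scratch_size(
                          1, Kokkos::PerTeam(
                                 scratch_view::shmem_size(scratch_len)));

    // Step 2: carve a View out of the team's scratch inside the kernel.
    Kokkos::parallel_for(
        "scratch_demo", policy, KOKKOS_LAMBDA(const member_type& team) {
          scratch_view mem(team.team_scratch(1), scratch_len);
          Kokkos::parallel_for(Kokkos::TeamVectorRange(team, scratch_len),
                               [&](int i) { mem(i) = i; });
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
}
```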
Kokkos::CudaSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); - - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::accessible); + + static_assert(!Kokkos::Impl::MemorySpaceAccess< + Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::CudaUVMSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaSpace>::accessible); static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaUVMSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::CudaHostPinnedSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( std::is_same::Space, - Kokkos::CudaHostPinnedSpace>::value, - ""); + Kokkos::CudaHostPinnedSpace>::value); static_assert(std::is_same, Kokkos::Device>::value, - ""); + Kokkos::CudaUVMSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); - static_assert( - Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + static_assert(Kokkos::SpaceAccessibility< + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); #ifdef KOKKOS_ENABLE_CUDA_UVM using uvm_view = Kokkos::View; static_assert(std::is_same::Space; static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + 
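Two distinct relations are asserted throughout this test: accessible (code running in the first space may dereference allocations from the second) and assignable (a View of the second space may be assigned to a View type of the first without a deep copy). Two representative pairs from the assertions above, restated standalone:

```cpp
#include <Kokkos_Core.hpp>

// Host code can freely reach pinned host memory...
static_assert(Kokkos::Impl::MemorySpaceAccess<
              Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace>::assignable);
// ...but must never dereference device-resident CudaSpace allocations.
static_assert(!Kokkos::Impl::MemorySpaceAccess<
              Kokkos::HostSpace, Kokkos::CudaSpace>::accessible);

int main() {}
```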
Kokkos::HostSpace>::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); } } // namespace Test diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp deleted file mode 100644 index 348b9feeab0..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_1.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp deleted file mode 100644 index a77a55ea653..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_10.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp deleted file mode 100644 index 1b6a140920c..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_11.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp deleted file mode 100644 index 316bc85526f..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_12.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp deleted file mode 100644 index 6344960a1cf..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_13.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp deleted file mode 100644 index 4515174b82b..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_14.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp deleted file mode 100644 index 7ead50f0944..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_15.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp deleted file mode 100644 index e12b9b3894a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_16.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp deleted file mode 100644 index 959d0ab7503..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_17.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_17 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp deleted file mode 100644 index 07d841519dc..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_18.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_18 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp deleted file mode 100644 index 042a515b16a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_2.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp deleted file mode 100644 index dba401e5bcf..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_3.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp deleted file mode 100644 index a44c58bdb55..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_4.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp deleted file mode 100644 index cac0841dd83..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_5.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp deleted file mode 100644 index bafe3b3fd2a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_6.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp deleted file mode 100644 index 3a4dd9d2533..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_7.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp deleted file mode 100644 index 4e92aae565a..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_8.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08 -#include diff --git a/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp b/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp deleted file mode 100644 index 44b8f3428d9..00000000000 --- a/core/unit_test/default/TestDefaultDeviceTypeInit_9.cpp +++ /dev/null @@ -1,18 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#define KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09 -#include diff --git a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp index 8c72e9f2972..a213453ea18 100644 --- a/core/unit_test/hip/TestHIP_Memory_Requirements.cpp +++ b/core/unit_test/hip/TestHIP_Memory_Requirements.cpp @@ -48,6 +48,9 @@ TEST(hip, memory_requirements) { // we want all user-facing memory in hip to be coarse grained. 
As of // today(07.01.22) the documentation is not reliable/correct, we test the // memory on the device and host + // FIXME_HIP + GTEST_SKIP() << "skipping the test because the CI on MI100 returns: error( " + "hipErrorInvalidValue)"; KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPHostPinnedSpace, int, 10); KOKKOS_TEST_MEMORY_COARSEGRAINEDNESS(Kokkos::HIPManagedSpace, int, 10); diff --git a/core/unit_test/hip/TestHIP_Spaces.cpp b/core/unit_test/hip/TestHIP_Spaces.cpp index 14fd4e28837..8f7499c244b 100644 --- a/core/unit_test/hip/TestHIP_Spaces.cpp +++ b/core/unit_test/hip/TestHIP_Spaces.cpp @@ -29,198 +29,164 @@ __global__ void test_hip_spaces_int_value(int *ptr) { TEST(hip, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPManagedSpace>::accessible); //-------------------------------------- static_assert( Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPManagedSpace>::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::assignable); - static_assert(!Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + static_assert( + !Kokkos::Impl::MemorySpaceAccess::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + 
Kokkos::HIPSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HIPHostPinnedSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); //-------------------------------------- static_assert( - !Kokkos::SpaceAccessibility::accessible, - ""); + !Kokkos::SpaceAccessibility::accessible); static_assert( - Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::SpaceAccessibility::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPSpace>::accessible); static_assert( Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HIPHostPinnedSpace>::accessible); - static_assert(Kokkos::SpaceAccessibility::accessible, - ""); + static_assert( + Kokkos::SpaceAccessibility::accessible); static_assert(std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same::Space, - Kokkos::HIPHostPinnedSpace>::value, - ""); + Kokkos::HIPHostPinnedSpace>::value); static_assert( std::is_same::Space, Kokkos::Device>::value, - ""); + Kokkos::HIPManagedSpace>>::value); static_assert( Kokkos::SpaceAccessibility::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); } template diff --git a/core/unit_test/incremental/Test01_execspace.hpp b/core/unit_test/incremental/Test01_execspace.hpp index 25c7138ed3c..d7b2a57b442 100644 --- a/core/unit_test/incremental/Test01_execspace.hpp +++ b/core/unit_test/incremental/Test01_execspace.hpp @@ -62,8 +62,10 @@ struct TestIncrExecSpace { auto concurrency = ExecSpace().concurrency(); ASSERT_GT(concurrency, 0); +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 int in_parallel = ExecSpace::in_parallel(); ASSERT_FALSE(in_parallel); +#endif const char* name = ExecSpace::name(); std::cout << name << std::endl; diff --git a/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp b/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp deleted file mode 100644 index 92b8032bf0c..00000000000 --- a/core/unit_test/openmp/TestOpenMP_PartitionMaster.cpp +++ /dev/null @@ -1,105 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
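The Test01_execspace hunk above wraps the call to the deprecated ExecSpace::in_parallel() in a KOKKOS_ENABLE_DEPRECATED_CODE_4 guard. A hedged sketch of the same guard pattern in user code, assuming a host execution space; check_not_nested is a hypothetical helper:

#include <Kokkos_Core.hpp>

void check_not_nested() {
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
  // in_parallel() is deprecated in Kokkos 4; only compile the call when
  // the backward-compatibility option is enabled at configure time.
  const bool nested = Kokkos::DefaultHostExecutionSpace::in_parallel();
  (void)nested;  // e.g. assert(!nested) when called outside a parallel region
#endif
}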
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include -#include - -#include - -namespace Test { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 -TEST(openmp, partition_master) { - using Mutex = Kokkos::Experimental::MasterLock; - - Mutex mtx; - int errors = 0; - - auto master = [&errors, &mtx](int /*partition_id*/, int /*num_partitions*/) { - const int pool_size = Kokkos::OpenMP().impl_thread_pool_size(); - - { - std::unique_lock lock(mtx); - if (Kokkos::OpenMP::in_parallel()) { - ++errors; - } - if (Kokkos::OpenMP::impl_thread_pool_rank() != 0) { - ++errors; - } - } - - { - int local_errors = 0; - Kokkos::parallel_reduce( - Kokkos::RangePolicy(0, 1000), - [pool_size](const int, int& errs) { - if (Kokkos::OpenMP().impl_thread_pool_size() != pool_size) { - ++errs; - } - }, - local_errors); - Kokkos::atomic_add(&errors, local_errors); - } - - Kokkos::Experimental::UniqueToken token; - - Kokkos::View count("", token.size()); - - Kokkos::parallel_for(Kokkos::RangePolicy(0, 1000), - [=](const int) { - int i = token.acquire(); - ++count[i]; - token.release(i); - }); - - Kokkos::View sum(""); - Kokkos::parallel_for( - Kokkos::RangePolicy(0, token.size()), - [=](const int i) { Kokkos::atomic_add(sum.data(), count[i]); }); - - if (sum() != 1000) { - Kokkos::atomic_add(&errors, 1); - } - }; - - master(0, 1); - - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 4, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 4); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 2, 2); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 0); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 0, 8); - ASSERT_EQ(errors, 0); - - Kokkos::OpenMP::partition_master(master, 8, 8); - ASSERT_EQ(errors, 0); -} -#endif - -} // namespace Test diff --git a/core/unit_test/sycl/TestSYCL_Spaces.cpp b/core/unit_test/sycl/TestSYCL_Spaces.cpp index 914f8432488..a4fd053e83d 100644 --- a/core/unit_test/sycl/TestSYCL_Spaces.cpp +++ b/core/unit_test/sycl/TestSYCL_Spaces.cpp @@ -21,235 +21,192 @@ namespace Test { TEST(sycl, space_access) { static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - 
Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLDeviceUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); //-------------------------------------- static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::assignable); static_assert( !Kokkos::Impl::MemorySpaceAccess::assignable, - ""); + Kokkos::HostSpace>::assignable); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - 
Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(!Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLHostUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCLHostUSMSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); //-------------------------------------- static_assert(!Kokkos::SpaceAccessibility::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Experimental::SYCL, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(!Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLDeviceUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLSharedUSMSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert( std::is_same::Space, - Kokkos::HostSpace>::value, - ""); + Kokkos::HostSpace>::value); static_assert( std::is_same< Kokkos::Impl::HostMirror< Kokkos::Experimental::SYCLSharedUSMSpace>::Space, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert( Kokkos::Impl::MemorySpaceAccess::accessible, - ""); + Kokkos::HostSpace>::accessible); static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - Kokkos::Experimental::SYCLHostUSMSpace>::accessible, - ""); + Kokkos::HostSpace, + Kokkos::Experimental::SYCLHostUSMSpace>::accessible); static_assert(std::is_same::Space, - Kokkos::Experimental::SYCLHostUSMSpace>::value, - ""); + Kokkos::Experimental::SYCLHostUSMSpace>::value); static_assert( std::is_same< Kokkos::Device, Kokkos::Device>::value, - ""); + Kokkos::Experimental::SYCLSharedUSMSpace>>::value); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLDeviceUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLSharedUSMSpace>::Space, - 
Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLSharedUSMSpace>::Space, + Kokkos::HostSpace>::accessible); static_assert(Kokkos::SpaceAccessibility< - Kokkos::Impl::HostMirror< - Kokkos::Experimental::SYCLHostUSMSpace>::Space, - Kokkos::HostSpace>::accessible, - ""); + Kokkos::Impl::HostMirror< + Kokkos::Experimental::SYCLHostUSMSpace>::Space, + Kokkos::HostSpace>::accessible); } TEST(sycl, uvm) { diff --git a/core/unit_test/tools/TestEventCorrectness.hpp b/core/unit_test/tools/TestEventCorrectness.hpp index 3c85f661aae..946169a786d 100644 --- a/core/unit_test/tools/TestEventCorrectness.hpp +++ b/core/unit_test/tools/TestEventCorrectness.hpp @@ -409,14 +409,19 @@ TEST(kokkosp, parallel_scan_no_fence) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. + TestScanFunctor tf; + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); auto success = validate_absence( - [=]() { - TestScanFunctor tf; - Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); - }, + [=]() { Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf); }, [=](BeginFenceEvent begin_event) { if (begin_event.name.find("Debug Only Check for Execution Error") != std::string::npos || @@ -450,13 +455,20 @@ TEST(kokkosp, parallel_scan_no_fence_view) { << "skipping since the OpenMPTarget backend has unexpected fences"; #endif + // Execute the parallel_scan first without looking for fence events. + // Depending on the backend implementation and the order of tests, + // it might be that the first call to parallel_scan is reallocating scratch + // memory which implies a fence when deallocating. We are not interested in + // detecting this event. + TestScanFunctor tf; + Kokkos::View v("scan_result"); + Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); + using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); - Kokkos::View v("scan_result"); auto success = validate_absence( [=]() { - TestScanFunctor tf; Kokkos::parallel_scan("dogs", Kokkos::RangePolicy<>(0, 1), tf, v); }, [=](BeginFenceEvent begin_event) { diff --git a/core/unit_test/tools/TestLogicalSpaces.hpp b/core/unit_test/tools/TestLogicalSpaces.hpp deleted file mode 100644 index 4e56f8996a0..00000000000 --- a/core/unit_test/tools/TestLogicalSpaces.hpp +++ /dev/null @@ -1,177 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. 
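The TestEventCorrectness change above moves the first parallel_scan out of the instrumented region: depending on the backend, the first scan may allocate scratch memory, and releasing that scratch fences, which would trip the fence-absence check. A hedged sketch of the warm-up idiom, where TestScanFunctor and listen_tool_events are the helpers from the test file and the kernel labels are illustrative:

// Warm up: let any one-time scratch allocation (and the fence that its
// deallocation implies) happen before fence events are recorded.
TestScanFunctor tf;
Kokkos::parallel_scan("warmup", Kokkos::RangePolicy<>(0, 1), tf);

// Only now enable the fence listeners; the measured scan should reuse the
// scratch buffer and therefore trigger no fence event.
listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
                   Config::EnableFences());
Kokkos::parallel_scan("measured", Kokkos::RangePolicy<>(0, 1), tf);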
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER -#include -#include -#include "Kokkos_Core.hpp" - -#include - -namespace Test { - -void debug_print(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Alloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} -void debug_dealloc(const Kokkos_Profiling_SpaceHandle hand, const char* name, - const void* ptr, const size_t size) { - std::cout << "Dealloc: " << hand.name << ", [" << name << "," << ptr << "] " - << size << std::endl; -} - -void fail_on_event(const Kokkos::Profiling::SpaceHandle, const char*, - const void*, const uint64_t) { - ASSERT_TRUE(false) << "Unexpected memory event"; -} - -void expect_no_events() { - Kokkos::Tools::Experimental::set_allocate_data_callback(&fail_on_event); - Kokkos::Tools::Experimental::set_deallocate_data_callback(&fail_on_event); -} - -std::string expected_view_name; -std::string expected_space_name; -std::string error_message; -void expect_allocation_event(const std::string evn, const std::string esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_allocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} -void expect_deallocation_event(const std::string& evn, const std::string& esn, - const std::string em) { - expected_view_name = evn; - expected_space_name = esn; - error_message = em; - Kokkos::Tools::Experimental::set_deallocate_data_callback( - [](const Kokkos_Profiling_SpaceHandle hand, const char* name, const void*, - const uint64_t) { - ASSERT_EQ(std::string(hand.name), expected_space_name) - << error_message << " (bad handle)"; - ASSERT_EQ(std::string(name), expected_view_name) - << error_message << " (bad view name)"; - expect_no_events(); - }); -} - -struct TestSpaceNamer { - static constexpr const char* get_name() { return "TestSpace"; } -}; -struct TestSpaceNamerTwo { - static constexpr const char* get_name() { return "YoDawg"; } -}; -struct TestSpaceNamerThree { - static constexpr const char* get_name() { return "CustomAccessSpace"; } -}; -using fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, TestSpaceNamer, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - -void test_view_construct() { - { - expect_allocation_event("puppy_view", "TestSpace", "View allocation"); - Kokkos::View pup_view("puppy_view", 1000); - expect_deallocation_event("puppy_view", "TestSpace", "View free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_malloc_free() { - expect_allocation_event("does_malloc_work", "TestSpace", - "Error in malloc event"); - auto* temp = - Kokkos::kokkos_malloc("does_malloc_work", 1000); - expect_deallocation_event("does_malloc_work", "TestSpace", "Error in free"); - Kokkos::kokkos_free(temp); - Kokkos::Tools::Experimental::pause_tools(); -} -void test_chained_spaces() { - using doubly_fake_memory_space = Kokkos::Experimental::LogicalMemorySpace< - fake_memory_space, Kokkos::DefaultHostExecutionSpace, TestSpaceNamerTwo, - Kokkos::Experimental::LogicalSpaceSharesAccess::shared_access>; - { - 
expect_allocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space view allocation"); - Kokkos::View pup_view("xzibit_dot_jpeg", - 1000); - expect_deallocation_event("xzibit_dot_jpeg", "YoDawg", - "Chained space free"); - } - Kokkos::Tools::Experimental::pause_tools(); -} -void test_space_allocations() { - fake_memory_space debug_space; - expect_allocation_event("allocation_from_space", "TestSpace", - "Space allocation"); - auto* temp = debug_space.allocate("allocation_from_space", 1000); - expect_deallocation_event("allocation_from_space", "TestSpace", - "Space deallocation"); - debug_space.deallocate("allocation_from_space", temp, 1000); - Kokkos::Tools::Experimental::pause_tools(); -} -template -struct AccessCheckKernel { - Kokkos::View data; - KOKKOS_FUNCTION void operator()(const int i) const { data[i] = i; } -}; - -template -void test_allowed_access() { - constexpr const int data_size = 1000; - // We use an unmananged View here since we want to detect a memory access - // violation in the parallel_for and not in the initialization of the View. - std::vector test_data(data_size); - Kokkos::View test_view(test_data.data(), data_size); - AccessCheckKernel functor{test_view}; - Kokkos::parallel_for( - "access_allowed", - Kokkos::RangePolicy(0, data_size), - functor); - Kokkos::fence(); -} - -using semantically_independent_logical_space = - Kokkos::Experimental::LogicalMemorySpace< - Kokkos::HostSpace, Kokkos::DefaultHostExecutionSpace, - TestSpaceNamerThree, - Kokkos::Experimental::LogicalSpaceSharesAccess::no_shared_access>; - -TEST(defaultdevicetype, logical_space_views) { test_view_construct(); } -TEST(defaultdevicetype, logical_space_malloc) { test_malloc_free(); } -TEST(defaultdevicetype, logical_space_alloc) { test_space_allocations(); } -TEST(defaultdevicetype, chained_logical_spaces) { test_chained_spaces(); } -TEST(defaultdevicetype, access_allowed) { - test_allowed_access(); -} -// FIXME_SYCL -#if !(defined(KOKKOS_COMPILER_INTEL_LLVM) && defined(KOKKOS_ENABLE_SYCL)) -TEST(defaultdevicetype_DeathTest, access_forbidden) { - ::testing::FLAGS_gtest_death_test_style = "threadsafe"; - ASSERT_DEATH( - { test_allowed_access(); }, - "Kokkos::View ERROR: attempt to access inaccessible memory space"); -} -#endif - -} // namespace Test diff --git a/core/unit_test/tools/TestProfilingSection.cpp b/core/unit_test/tools/TestProfilingSection.cpp index 318766ac455..9d35d67feb0 100644 --- a/core/unit_test/tools/TestProfilingSection.cpp +++ b/core/unit_test/tools/TestProfilingSection.cpp @@ -108,8 +108,8 @@ TEST(defaultdevicetype, profiling_section) { } using Kokkos::Profiling::ProfilingSection; -static_assert(!std::is_default_constructible::value, ""); -static_assert(!std::is_copy_constructible::value, ""); -static_assert(!std::is_move_constructible::value, ""); -static_assert(!std::is_copy_assignable::value, ""); -static_assert(!std::is_move_assignable::value, ""); +static_assert(!std::is_default_constructible::value); +static_assert(!std::is_copy_constructible::value); +static_assert(!std::is_move_constructible::value); +static_assert(!std::is_copy_assignable::value); +static_assert(!std::is_move_assignable::value); diff --git a/example/tutorial/01_hello_world/hello_world.cpp b/example/tutorial/01_hello_world/hello_world.cpp index 5b8a21af833..22b8b6d63c8 100644 --- a/example/tutorial/01_hello_world/hello_world.cpp +++ b/example/tutorial/01_hello_world/hello_world.cpp @@ -58,12 +58,7 @@ struct hello_world { // is unnecessary but harmless. 
KOKKOS_INLINE_FUNCTION void operator()(const int i) const { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + Kokkos::printf("Hello from i = %i\n", i); } }; diff --git a/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp index c78f3076361..909765e1fc3 100644 --- a/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp +++ b/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp @@ -76,13 +76,9 @@ int main(int argc, char* argv[]) { #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( 15, KOKKOS_LAMBDA(const int i) { - // FIXME_SYCL needs workaround for printf -#ifndef __SYCL_DEVICE_ONLY__ - // printf works in a CUDA parallel kernel; std::ostream does not. - printf("Hello from i = %i\n", i); -#else - (void)i; -#endif + // Kokkos::printf works for all backends in a parallel kernel; + // std::ostream does not. + Kokkos::printf("Hello from i = %i\n", i); }); #endif // You must call finalize() after you are done using Kokkos. diff --git a/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp b/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp index b041f8d435b..ee3f4721d91 100644 --- a/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp +++ b/example/tutorial/Hierarchical_Parallelism/01_thread_teams/thread_teams.cpp @@ -47,13 +47,9 @@ struct hello_world { // The TeamPolicy<>::member_type provides functions to query the multi // dimensional index of a thread as well as the number of thread-teams and // the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); } }; diff --git a/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp index 933b254f7c7..1e6812adead 100644 --- a/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp +++ b/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp @@ -57,16 +57,12 @@ int main(int narg, char* args[]) { policy, KOKKOS_LAMBDA(const team_member& thread, int& lsum) { lsum += 1; - // TeamPolicy<>::member_type provides functions to query the - // multidimensional index of a thread, as well as the number of - // thread teams and the size of each team. -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs workaround for printf - printf("Hello World: %i %i // %i %i\n", thread.league_rank(), - thread.team_rank(), thread.league_size(), thread.team_size()); -#else - (void)thread; -#endif + // TeamPolicy<>::member_type provides functions to query the + // multidimensional index of a thread, as well as the number of + // thread teams and the size of each team.
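The tutorial hunks above and below replace the raw printf (and its SYCL guard) with Kokkos::printf, which can be called from kernels on every backend. A self-contained sketch, assuming a Kokkos version that ships Kokkos::printf:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Kokkos::printf works inside device code on all backends; the
    // ordering of output across iterations is unspecified.
    Kokkos::parallel_for(
        4, KOKKOS_LAMBDA(const int i) { Kokkos::printf("i = %d\n", i); });
    Kokkos::fence();
  }
  Kokkos::finalize();
}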
+ Kokkos::printf("Hello World: %i %i // %i %i\n", thread.league_rank(), + thread.team_rank(), thread.league_size(), + thread.team_size()); }, sum); #endif diff --git a/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp b/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp index 398810d1331..75d6089e9af 100644 --- a/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp +++ b/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/nested_parallel_for.cpp @@ -43,16 +43,11 @@ struct hello_world { // the operator using a team_policy acts like a parallel region for the // team. That means that everything outside of the nested parallel_for is // also executed by all threads of the team. - Kokkos::parallel_for(Kokkos::TeamThreadRange(thread, 31), - [&](const int& i) { -#ifndef __SYCL_DEVICE_ONLY__ - // FIXME_SYCL needs printf workaround - printf("Hello World: (%i , %i) executed loop %i \n", - thread.league_rank(), thread.team_rank(), i); -#else - (void) i; -#endif - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(thread, 31), [&](const int& i) { + Kokkos::printf("Hello World: (%i , %i) executed loop %i \n", + thread.league_rank(), thread.team_rank(), i); + }); } }; diff --git a/generate_makefile.bash b/generate_makefile.bash index 301a1fceb5a..25370daa3f2 100755 --- a/generate_makefile.bash +++ b/generate_makefile.bash @@ -170,12 +170,9 @@ display_help_text() { echo " ARMV8_THUNDERX = ARMv8 Cavium ThunderX CPU" echo " ARMV8_THUNDERX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -226,7 +223,6 @@ display_help_text() { echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -342,10 +338,6 @@ do KOKKOS_HWLOC=ON HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - KOKKOS_MEMKIND=ON - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -452,15 +444,6 @@ else KOKKOS_HWLOC_CMD= fi -if [ "$KOKKOS_MEMKIND" == "ON" ]; then - KOKKOS_MEMKIND_CMD=-DKokkos_ENABLE_MEMKIND=ON - if [ "$MEMKIND_PATH" != "" ]; then - KOKKOS_MEMKIND_PATH_CMD=-DMEMKIND_ROOT=$MEMKIND_PATH - fi -else - KOKKOS_MEMKIND_CMD= -fi - if [ ! 
-e ${KOKKOS_PATH}/CMakeLists.txt ]; then if [ "${KOKKOS_PATH}" == "" ]; then CM_SCRIPT=$0 @@ -506,5 +489,5 @@ if [[ ${COMPILER} == *clang* ]]; then fi fi -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${PREFIX} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_DEBUG_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${PASSTHRU_CMAKE_FLAGS} -DKokkos_ENABLE_DEPRECATION_WARNINGS=${KOKKOS_DEPRECATED_CODE_WARNINGS} -DKokkos_ENABLE_DEPRECATED_CODE_4=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} diff --git a/gnu_generate_makefile.bash b/gnu_generate_makefile.bash index 5ea159cdd47..7a197bb71d4 100755 --- a/gnu_generate_makefile.bash +++ b/gnu_generate_makefile.bash @@ -74,9 +74,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -148,12 +145,9 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel 
Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -198,7 +192,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -298,11 +291,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/master_history.txt b/master_history.txt index a43b5276a83..bd122a456bd 100644 --- a/master_history.txt +++ b/master_history.txt @@ -35,3 +35,4 @@ tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88 tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a +tag: 4.3.00 date: 04:03:2024 master: e0dc0128 release: f08217a4 diff --git a/scripts/docker/Dockerfile.clang b/scripts/docker/Dockerfile.clang index 5c6abc1c6de..b493c3bbff0 100644 --- a/scripts/docker/Dockerfile.clang +++ b/scripts/docker/Dockerfile.clang @@ -1,49 +1,13 @@ -FROM ubuntu:18.04 +FROM ubuntu:20.04 RUN apt-get update && apt-get install -y \ bc \ git \ build-essential \ + clang-format-8 \ wget \ - ccache \ && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -RUN KEYDUMP_URL=https://cloud.cees.ornl.gov/download && \ - KEYDUMP_FILE=keydump && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE} && \ - wget --quiet ${KEYDUMP_URL}/${KEYDUMP_FILE}.sig && \ - gpg --import ${KEYDUMP_FILE} && \ - gpg --verify ${KEYDUMP_FILE}.sig ${KEYDUMP_FILE} && \ - rm ${KEYDUMP_FILE}* - -ARG CMAKE_VERSION=3.16.8 -ENV CMAKE_DIR=/opt/cmake -RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ - CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ - CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ - wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ - gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ - grep -i ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sed -e s/linux/Linux/ | sha256sum --check && \ - mkdir -p ${CMAKE_DIR} && \ - sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ - rm cmake* -ENV PATH=${CMAKE_DIR}/bin:$PATH - -ENV LLVM_DIR=/opt/llvm -RUN LLVM_VERSION=8.0.0 && \ - LLVM_URL=https://releases.llvm.org/${LLVM_VERSION}/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-18.04.tar.xz && \ - LLVM_ARCHIVE=llvm-${LLVM_VERSION}.tar.xz && \ - SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ - wget --quiet ${LLVM_URL} --output-document=${LLVM_ARCHIVE} && \ - wget --quiet ${LLVM_URL}.sig --output-document=${LLVM_ARCHIVE}.sig && \ - gpg --verify ${LLVM_ARCHIVE}.sig ${LLVM_ARCHIVE} && \ - mkdir -p ${LLVM_DIR} && \ - tar -xvf ${LLVM_ARCHIVE} -C ${LLVM_DIR} --strip-components=1 && \ - echo "${LLVM_DIR}/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig && \ - rm -rf /root/.gnupg && \ - rm -rf ${SCRATCH_DIR} -ENV PATH=${LLVM_DIR}/bin:$PATH +ENV 
CLANG_FORMAT_EXE=clang-format-8 diff --git a/scripts/docker/Dockerfile.openmptarget b/scripts/docker/Dockerfile.openmptarget index 708cf533b8a..22edcda2a07 100644 --- a/scripts/docker/Dockerfile.openmptarget +++ b/scripts/docker/Dockerfile.openmptarget @@ -38,7 +38,7 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO rm ${CMAKE_SCRIPT} ENV PATH=${CMAKE_DIR}/bin:$PATH -ARG LLVM_VERSION=llvmorg-17.0.1 +ARG LLVM_VERSION=llvmorg-17.0.3 ENV LLVM_DIR=/opt/llvm RUN LLVM_URL=https://github.com/llvm/llvm-project/archive &&\ LLVM_ARCHIVE=${LLVM_VERSION}.tar.gz &&\ diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 714461bfe6a..87864da1bf7 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -55,3 +55,12 @@ RUN wget https://registrationcenter-download.intel.com/akdlm/irc_nas/19133/l_one chmod +x ./l_oneDPL_p_2022.0.0.25335.sh && \ ./l_oneDPL_p_2022.0.0.25335.sh -a -s --eula accept && \ rm l_oneDPL_p_2022.0.0.25335.sh + +# clang++ +ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin-llvm/:$PATH +# sycl-ls, icpx +ENV PATH=/opt/intel/oneapi/compiler/latest/linux/bin/:$PATH +# libsycl +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/lib:$LD_LIBRARY_PATH +# libsvml +ENV LD_LIBRARY_PATH=/opt/intel/oneapi/compiler/latest/linux/compiler/lib/intel64_lin:$LD_LIBRARY_PATH diff --git a/scripts/testing_scripts/generate_makefile.bash b/scripts/testing_scripts/generate_makefile.bash index ae1db3186f7..830d7b12d90 100755 --- a/scripts/testing_scripts/generate_makefile.bash +++ b/scripts/testing_scripts/generate_makefile.bash @@ -59,9 +59,6 @@ do --with-hwloc*) HWLOC_PATH="${key#*=}" ;; - --with-memkind*) - MEMKIND_PATH="${key#*=}" - ;; --arch*) KOKKOS_ARCH="${key#*=}" ;; @@ -136,12 +133,9 @@ do echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " ARMv8-TX2 = ARMv8 Cavium ThunderX2 CPU" echo " [IBM]" - echo " BGQ = IBM Blue Gene Q" - echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" echo " [Intel]" - echo " WSM = Intel Westmere CPUs" echo " SNB = Intel Sandy/Ivy Bridge CPUs" echo " HSW = Intel Haswell CPUs" echo " BDW = Intel Broadwell Xeon E-class CPUs" @@ -177,7 +171,6 @@ do echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." - echo "--with-memkind=/Path/To/MemKind: Set path to memkind library." echo "--with-options=[OPT]: Additional options to Kokkos:" echo " compiler_warnings" echo " aggressive_vectorization = add ivdep on loops" @@ -269,11 +262,6 @@ if [ ${#HWLOC_PATH} -gt 0 ]; then KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},hwloc" fi -if [ ${#MEMKIND_PATH} -gt 0 ]; then - KOKKOS_SETTINGS="${KOKKOS_SETTINGS} MEMKIND_PATH=${MEMKIND_PATH}" - KOKKOS_USE_TPLS="${KOKKOS_USE_TPLS},experimental_memkind" -fi - if [ ${#KOKKOS_USE_TPLS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_USE_TPLS=${KOKKOS_USE_TPLS}" fi diff --git a/simd/src/Kokkos_SIMD_AVX2.hpp b/simd/src/Kokkos_SIMD_AVX2.hpp index 521160b76fc..6d0956f3832 100644 --- a/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/simd/src/Kokkos_SIMD_AVX2.hpp @@ -30,9 +30,11 @@ "Kokkos_SIMD_AVX2.hpp must be included before Kokkos_SIMD_Common_Math.hpp!" #endif -// FIXME_HIP ROCm 5.6 and 5.7 can't compile with the intrinsic used here. 
-#if defined(__HIPCC__) && (HIP_VERSION_MAJOR == 5) && \ - ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7)) +// FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used here. +#if defined(__HIPCC__) && \ + (((HIP_VERSION_MAJOR == 5) && \ + ((HIP_VERSION_MINOR == 6) || (HIP_VERSION_MINOR == 7))) || \ + ((HIP_VERSION_MAJOR == 6) && ((HIP_VERSION_MINOR == 0)))) #define KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE #endif @@ -563,10 +565,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256d() const { return m_value; @@ -818,10 +828,18 @@ class simd> { element_aligned_tag) { m_value = _mm_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128() const { return m_value; @@ -1059,17 +1077,31 @@ class simd> { } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, element_aligned_tag) { - // FIXME_HIP ROCm 5.6 can't compile with the intrinsic used here. + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used + // here. #ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE m_value = _mm_loadu_si128(reinterpret_cast<__m128i const*>(ptr)); #else m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + // FIXME_HIP ROCm 5.6, 5.7, and 6.0 can't compile with the intrinsic used here.
+#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm_load_si128(reinterpret_cast<__m128i const*>(ptr)); +#else + m_value = _mm_maskload_epi32(ptr, static_cast<__m128i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm_maskstore_epi32(ptr, static_cast<__m128i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m128i() const { return m_value; @@ -1111,6 +1143,11 @@ class simd> { return simd( _mm_add_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + _mm_mullo_epi32(static_cast<__m128i>(lhs), static_cast<__m128i>(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { @@ -1249,6 +1286,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( @@ -1256,6 +1302,11 @@ class simd> { _mm256_maskstore_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true)), m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true)), m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; @@ -1278,6 +1329,13 @@ class simd> { _mm256_add_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit signed integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + // AVX2 only has eq and gt comparisons for int64 [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { @@ -1306,17 +1364,19 @@ class simd> { return !(lhs == rhs); } + // fallback simd shift right arithmetic using generator constructor // Shift right arithmetic for 64bit packed ints is not availalbe in AVX2 - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, int rhs) noexcept { - // return simd(_mm256_srai_epi64(static_cast<__m256i>(lhs), rhs)); - // } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, int rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs; }); + } - // [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd( - // simd const& lhs, simd const& rhs) noexcept { - // return simd(_mm256_srav_epi64(static_cast<__m256i>(lhs), - // static_cast<__m256i>(rhs)))); - // } + // fallback simd shift right arithmetic using 
generator constructor + // Shift right arithmetic for 64bit packed ints is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator<<( simd const& lhs, int rhs) noexcept { @@ -1444,6 +1504,15 @@ class simd> { #else m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), static_cast<__m256i>(mask_type(true))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + m_value = _mm256_load_si256(reinterpret_cast<__m256i const*>(ptr)); +#else + m_value = _mm256_maskload_epi64(reinterpret_cast(ptr), + static_cast<__m256i>(mask_type(true))); #endif } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() @@ -1460,6 +1529,14 @@ class simd> { return simd( _mm256_sub_epi64(static_cast<__m256i>(lhs), static_cast<__m256i>(rhs))); } + + // fallback simd multiplication using generator constructor + // multiplying vectors of 64-bit unsigned integers is not available in AVX2 + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator>>( simd const& lhs, int rhs) noexcept { return _mm256_srli_epi64(static_cast<__m256i>(lhs), rhs); @@ -1588,6 +1665,11 @@ class const_where_expression>, static_cast<__m256d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm256_maskstore_pd(mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)), + static_cast<__m256d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1624,6 +1706,11 @@ class where_expression>, mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_maskload_pd( + mem, _mm256_castpd_si256(static_cast<__m256d>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1667,6 +1754,11 @@ class const_where_expression>, static_cast<__m128>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm_maskstore_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)), + static_cast<__m128>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1703,6 +1795,11 @@ class where_expression>, _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type( + _mm_maskload_ps(mem, _mm_castps_si128(static_cast<__m128>(m_mask)))); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, simd> const& index) { @@ -1746,6 +1843,12 @@ class const_where_expression< _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), static_cast<__m128i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm_maskstore_epi32(mem, static_cast<__m128i>(m_mask), + static_cast<__m128i>(m_value)); + } + 
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1786,6 +1889,16 @@ class where_expression>, m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m128i tmp = _mm_load_si128(reinterpret_cast<__m128i const*>(mem)); + m_value = value_type(_mm_and_si128(tmp, static_cast<__m128i>(m_mask))); +#else + m_value = value_type(_mm_maskload_epi32(mem, static_cast<__m128i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1833,6 +1946,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::int64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1874,6 +1994,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::int64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1922,6 +2053,13 @@ class const_where_expression< static_cast<__m256i>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(std::uint64_t* mem, + vector_aligned_tag) const { + _mm256_maskstore_epi64(reinterpret_cast(mem), + static_cast<__m256i>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1963,6 +2101,17 @@ class where_expression>, reinterpret_cast(mem), static_cast<__m256i>(m_mask))); #endif } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(std::uint64_t const* mem, + vector_aligned_tag) { +#ifdef KOKKOS_IMPL_WORKAROUND_ROCM_AVX2_ISSUE + __m256i tmp = _mm256_load_si256(reinterpret_cast<__m256i const*>(mem)); + m_value = value_type(_mm256_and_si256(tmp, static_cast<__m256i>(m_mask))); +#else + m_value = value_type(_mm256_maskload_epi64( + reinterpret_cast(mem), static_cast<__m256i>(m_mask))); +#endif + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, diff --git a/simd/src/Kokkos_SIMD_AVX512.hpp b/simd/src/Kokkos_SIMD_AVX512.hpp index c5d1717ad4e..7fa35c204ae 100644 --- a/simd/src/Kokkos_SIMD_AVX512.hpp +++ b/simd/src/Kokkos_SIMD_AVX512.hpp @@ -193,10 +193,18 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_pd(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_pd(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_pd(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_pd(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512d() const { return m_value; @@ 
-475,10 +483,18 @@ class simd> { element_aligned_tag) { m_value = _mm256_loadu_ps(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_load_ps(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_storeu_ps(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_store_ps(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256() const { return m_value; @@ -735,15 +751,25 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { @@ -934,21 +960,30 @@ class simd> { operator[](std::size_t i) const { return reinterpret_cast(&m_value)[i]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + element_aligned_tag) { + m_value = _mm256_mask_loadu_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm256_mask_storeu_epi32(ptr, static_cast<__mmask8>(mask_type(true)), m_value); } - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, - element_aligned_tag) { - m_value = _mm256_mask_loadu_epi32( - _mm256_set1_epi32(0), static_cast<__mmask8>(mask_type(true)), ptr); + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm256_mask_store_epi32(ptr, static_cast<__mmask8>(mask_type(true)), + m_value); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m256i() const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( simd const& lhs, simd const& rhs) noexcept { return simd(_mm256_mullo_epi32(static_cast<__m256i>(lhs), @@ -1130,10 +1165,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, 
m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1331,10 +1375,19 @@ class simd> { element_aligned_tag) { m_value = _mm512_loadu_si512(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = _mm512_load_si512(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { _mm512_storeu_si512(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + _mm512_store_si512(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator __m512i() const { return m_value; @@ -1505,6 +1558,11 @@ class const_where_expression>, static_cast<__m512d>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + _mm512_mask_store_pd(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512d>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1541,6 +1599,11 @@ class where_expression>, _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_pd( + _mm512_set1_pd(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1584,6 +1647,11 @@ class const_where_expression>, static_cast<__m256>(m_value)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + _mm256_mask_store_ps(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1619,6 +1687,10 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_ps( _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(float const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_ps( + _mm256_set1_ps(0.0), static_cast<__mmask8>(m_mask), mem)); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1666,6 +1738,12 @@ class const_where_expression< _mm256_mask_storeu_epi32(mem, static_cast<__mmask8>(m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1702,6 +1780,11 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1710,6 +1793,7 @@ class where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m256i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint32_t* mem, 
vector_aligned_tag) const { + _mm256_mask_store_epi32(mem, static_cast<__mmask8>(m_mask), + static_cast<__m256i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint32_t* mem, @@ -1784,6 +1874,12 @@ class where_expression>, m_value = value_type(_mm256_mask_loadu_epi32( _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint32_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm256_mask_load_epi32( + _mm256_set1_epi32(0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint32_t const* mem, @@ -1792,6 +1888,7 @@ class where_expression>, static_cast<__m256i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 4)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1866,6 +1969,12 @@ class where_expression>, m_value = value_type(_mm512_mask_loadu_epi64( _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1874,6 +1983,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template (m_mask), static_cast<__m512i>(m_value)); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::uint64_t* mem, vector_aligned_tag) const { + _mm512_mask_store_epi64(mem, static_cast<__mmask8>(m_mask), + static_cast<__m512i>(m_value)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::uint64_t* mem, @@ -1949,6 +2065,11 @@ class where_expression>, _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + m_value = value_type(_mm512_mask_load_epi64( + _mm512_set1_epi64(0.0), static_cast<__mmask8>(m_mask), mem)); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, simd> const& index) { @@ -1956,6 +2077,7 @@ class where_expression>, static_cast<__m512i>(m_value), static_cast<__mmask8>(m_mask), static_cast<__m256i>(index), mem, 8)); } + template class simd_mask; -struct element_aligned_tag {}; +class simd_alignment_vector_aligned {}; + +template <typename... Flags> +struct simd_flags {}; + +inline constexpr simd_flags<> simd_flag_default{}; +inline constexpr simd_flags<simd_alignment_vector_aligned> simd_flag_aligned{}; + +using element_aligned_tag = simd_flags<>; +using vector_aligned_tag = simd_flags<simd_alignment_vector_aligned>; // class template declarations for const_where_expression and where_expression @@ -117,48 +126,6 @@ template return const_where_expression(mask, value); } -// fallback simd multiplication using generator constructor -// At the time of this writing, this fallback is only used -// to multiply vectors of 64-bit signed integers for the AVX2 backend - -template -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator*( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); -} - -// fallback simd 
shift using generator constructor -// At the time of this edit, only the fallback for shift vectors of -// 64-bit signed integers for the AVX2 backend is used - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, int rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator>>( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] >> rhs[i]; }); -} - -template >> -[[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd operator<<( - simd const& lhs, simd const& rhs) { - return simd([&](std::size_t i) { return lhs[i] << rhs[i]; }); -} - // The code below provides: // operator@(simd, Arithmetic) // operator@(Arithmetic, simd) diff --git a/simd/src/Kokkos_SIMD_NEON.hpp b/simd/src/Kokkos_SIMD_NEON.hpp index 43ece203890..efc81135d16 100644 --- a/simd/src/Kokkos_SIMD_NEON.hpp +++ b/simd/src/Kokkos_SIMD_NEON.hpp @@ -363,10 +363,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_f64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_f64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_f64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_f64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float64x2_t() const { return m_value; @@ -607,10 +615,18 @@ class simd> { element_aligned_tag) { m_value = vld1_f32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_f32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_f32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_f32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator float32x2_t() const { return m_value; @@ -844,10 +860,18 @@ class simd> { element_aligned_tag) { m_value = vld1_s32(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1_s32(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1_s32(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1_s32(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int32x2_t() const { return m_value; @@ -868,7 +892,11 @@ class simd> { return simd( vadd_s32(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd( + vmul_s32(static_cast(lhs), static_cast(rhs))); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1044,10 +1072,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_s64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = 
vld1q_s64(ptr); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( value_type* ptr, element_aligned_tag) const { vst1q_s64(ptr, m_value); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_s64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator int64x2_t() const { return m_value; @@ -1068,7 +1104,10 @@ class simd> { return simd( vaddq_s64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend mask_type operator==(simd const& lhs, simd const& rhs) noexcept { return mask_type( @@ -1246,6 +1285,18 @@ class simd> { element_aligned_tag) { m_value = vld1q_u64(ptr); } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_from(value_type const* ptr, + vector_aligned_tag) { + m_value = vld1q_u64(ptr); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to( + value_type* ptr, element_aligned_tag) const { + vst1q_u64(ptr, m_value); + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void copy_to(value_type* ptr, + vector_aligned_tag) const { + vst1q_u64(ptr, m_value); + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION constexpr explicit operator uint64x2_t() const { return m_value; @@ -1261,7 +1312,10 @@ class simd> { return simd( vaddq_u64(static_cast(lhs), static_cast(rhs))); } - + [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator*( + simd const& lhs, simd const& rhs) noexcept { + return simd([&](std::size_t i) { return lhs[i] * rhs[i]; }); + } [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION friend simd operator&( simd const& lhs, simd const& rhs) noexcept { return simd( @@ -1386,6 +1440,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(double* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( double* mem, simd> const& index) const { @@ -1421,6 +1480,11 @@ class where_expression>, if (m_mask[1]) m_value[1] = mem[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(double const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( double const* mem, simd> const& index) { @@ -1464,6 +1528,11 @@ class const_where_expression>, if (m_mask[1]) mem[1] = m_value[1]; } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(float* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( float* mem, simd> const& index) const { @@ -1498,6 +1567,10 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + void copy_from(float const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( float const* mem, @@ -1542,6 +1615,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int32_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; 
+ } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int32_t* mem, @@ -1577,6 +1656,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int32_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int32_t const* mem, @@ -1584,6 +1669,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t< @@ -1622,6 +1708,12 @@ class const_where_expression< if (m_mask[0]) mem[0] = m_value[0]; if (m_mask[1]) mem[1] = m_value[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_to(std::int64_t* mem, vector_aligned_tag) const { + if (m_mask[0]) mem[0] = m_value[0]; + if (m_mask[1]) mem[1] = m_value[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void scatter_to( std::int64_t* mem, @@ -1657,6 +1749,12 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::int64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::int64_t const* mem, @@ -1664,6 +1762,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template < class U, std::enable_if_t>, if (m_mask[0]) m_value[0] = mem[0]; if (m_mask[1]) m_value[1] = mem[1]; } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION + void copy_from(std::uint64_t const* mem, vector_aligned_tag) { + if (m_mask[0]) m_value[0] = mem[0]; + if (m_mask[1]) m_value[1] = mem[1]; + } + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION void gather_from( std::uint64_t const* mem, @@ -1744,6 +1855,7 @@ class where_expression>, if (m_mask[0]) m_value[0] = mem[index[0]]; if (m_mask[1]) m_value[1] = mem[index[1]]; } + template { element_aligned_tag) { m_value = *ptr; } + KOKKOS_FORCEINLINE_FUNCTION void copy_from(T const* ptr, vector_aligned_tag) { + m_value = *ptr; + } KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, element_aligned_tag) const { *ptr = m_value; } + KOKKOS_FORCEINLINE_FUNCTION void copy_to(T* ptr, vector_aligned_tag) const { + *ptr = m_value; + } + KOKKOS_FORCEINLINE_FUNCTION reference operator[](std::size_t) { return m_value; } @@ -308,6 +315,10 @@ class const_where_expression, void copy_to(T* mem, element_aligned_tag) const { if (static_cast(m_mask)) *mem = static_cast(m_value); } + KOKKOS_FORCEINLINE_FUNCTION + void copy_to(T* mem, vector_aligned_tag) const { + if (static_cast(m_mask)) *mem = static_cast(m_value); + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> scatter_to(T* mem, simd const& index) const { @@ -315,13 +326,13 @@ class const_where_expression, mem[static_cast(index)] = static_cast(m_value); } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION value_type const& - impl_get_value() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION value_type const& impl_get_value() + const { return m_value; } - [[nodiscard]] KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION mask_type const& - impl_get_mask() const { + [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION mask_type const& impl_get_mask() + const { return m_mask; } }; @@ -344,6 +355,10 @@ class where_expression, void copy_from(T const* mem, element_aligned_tag) { if (static_cast(this->m_mask)) 
this->m_value = *mem; } + KOKKOS_FORCEINLINE_FUNCTION + void copy_from(T const* mem, vector_aligned_tag) { + if (static_cast(this->m_mask)) this->m_value = *mem; + } template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t> gather_from(T const* mem, simd const& index) { diff --git a/simd/unit_tests/TestSIMD.cpp b/simd/unit_tests/TestSIMD.cpp index 61c076e8246..7a1f9be2a0f 100644 --- a/simd/unit_tests/TestSIMD.cpp +++ b/simd/unit_tests/TestSIMD.cpp @@ -21,3 +21,4 @@ #include #include #include +#include diff --git a/simd/unit_tests/include/SIMDTesting_Ops.hpp b/simd/unit_tests/include/SIMDTesting_Ops.hpp index 6529f20e66a..c587ccf3046 100644 --- a/simd/unit_tests/include/SIMDTesting_Ops.hpp +++ b/simd/unit_tests/include/SIMDTesting_Ops.hpp @@ -209,4 +209,165 @@ class shift_left { } }; +class cbrt_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::cbrt(a); +#else + return Kokkos::cbrt(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::cbrt(a); + } +}; + +class exp_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::exp(a); +#else + return Kokkos::exp(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::exp(a); + } +}; + +class log_op { + public: + template + auto on_host(T const& a) const { +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + return Kokkos::Experimental::log(a); +#else + return Kokkos::log(a); +#endif + } + template + auto on_host_serial(T const& a) const { + return Kokkos::log(a); + } +}; + +class hmin { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmin(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::min(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::min(result, v[i]); + } + return result; + } +}; + +class hmax { + public: + template + auto on_host(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = Kokkos::max(result, v[i]); + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + return Kokkos::Experimental::hmax(a); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::max(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result = 
Kokkos::max(result, v[i]); + } + return result; + } +}; + +class reduce { + public: + template + auto on_host(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + auto on_host_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } + + template + KOKKOS_INLINE_FUNCTION auto on_device(T const& a) const { + using DataType = typename T::value_type::value_type; + return Kokkos::Experimental::reduce(a, DataType(0), std::plus<>()); + } + template + KOKKOS_INLINE_FUNCTION auto on_device_serial(T const& a) const { + using DataType = typename T::value_type::value_type; + + auto const& v = a.impl_get_value(); + auto const& m = a.impl_get_mask(); + auto result = Kokkos::reduction_identity::sum(); + for (std::size_t i = 0; i < v.size(); ++i) { + if (m[i]) result += v[i]; + } + return result; + } +}; + #endif diff --git a/simd/unit_tests/include/SIMDTesting_Utilities.hpp b/simd/unit_tests/include/SIMDTesting_Utilities.hpp index ae2ab2c697c..d36e1e5afc5 100644 --- a/simd/unit_tests/include/SIMDTesting_Utilities.hpp +++ b/simd/unit_tests/include/SIMDTesting_Utilities.hpp @@ -93,7 +93,7 @@ class load_element_aligned { bool host_load(T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); return true; } template @@ -101,7 +101,26 @@ class load_element_aligned { T const* mem, std::size_t n, Kokkos::Experimental::simd& result) const { if (n < result.size()) return false; - result.copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + result.copy_from(mem, Kokkos::Experimental::simd_flag_default); + return true; + } +}; + +class load_vector_aligned { + public: + template + bool host_load(T const* mem, std::size_t n, + Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); + return true; + } + template + KOKKOS_INLINE_FUNCTION bool device_load( + T const* mem, std::size_t n, + Kokkos::Experimental::simd& result) const { + if (n < result.size()) return false; + result.copy_from(mem, Kokkos::Experimental::simd_flag_aligned); return true; } }; @@ -116,8 +135,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = 0; return true; } @@ -130,8 +148,7 @@ class load_masked { for (std::size_t i = 0; i < n; ++i) { mask[i] = true; } - where(mask, result) - .copy_from(mem, Kokkos::Experimental::element_aligned_tag()); + where(mask, result).copy_from(mem, Kokkos::Experimental::simd_flag_default); where(!mask, result) = T(0); return true; } diff --git a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 4af08c266bb..23e3826c752 100644 --- a/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -37,10 +37,10 @@ inline void host_check_gen_ctor() { } 
simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); #if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if constexpr (std::is_same_v) { @@ -98,7 +98,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); simd_type rhs; - rhs.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + rhs.copy_from(init, Kokkos::Experimental::simd_flag_default); device_check_equality(basic, rhs, lanes); simd_type lhs(KOKKOS_LAMBDA(std::size_t i) { return init[i] * 9; }); @@ -106,7 +106,7 @@ KOKKOS_INLINE_FUNCTION void device_check_gen_ctor() { KOKKOS_LAMBDA(std::size_t i) { return (mask[i]) ? lhs[i] : rhs[i]; }); simd_type blend; - blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); + blend.copy_from(expected, Kokkos::Experimental::simd_flag_default); device_check_equality(result, blend, lanes); } diff --git a/simd/unit_tests/include/TestSIMD_MathOps.hpp b/simd/unit_tests/include/TestSIMD_MathOps.hpp index 802e41efe5f..59f2f6c18fd 100644 --- a/simd/unit_tests/include/TestSIMD_MathOps.hpp +++ b/simd/unit_tests/include/TestSIMD_MathOps.hpp @@ -61,13 +61,18 @@ void host_check_math_op_one_loader(UnaryOp unary_op, std::size_t n, simd_type arg; bool const loaded_arg = loader.host_load(args + i, nlanes, arg); if (!loaded_arg) continue; - auto computed_result = unary_op.on_host(arg); - decltype(computed_result) expected_result; + decltype(unary_op.on_host(arg)) expected_result; for (std::size_t lane = 0; lane < simd_type::size(); ++lane) { - if (lane < nlanes) + if (lane < nlanes) { + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) + arg[lane] = Kokkos::abs(arg[lane]); expected_result[lane] = unary_op.on_host_serial(T(arg[lane])); + } } + auto computed_result = unary_op.on_host(arg); host_check_equality(expected_result, computed_result, nlanes); } } @@ -78,6 +83,7 @@ inline void host_check_math_op_all_loaders(Op op, std::size_t n, host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); host_check_math_op_one_loader(op, n, args...); + host_check_math_op_one_loader(op, n, args...); } template @@ -96,6 +102,13 @@ inline void host_check_all_math_ops(const DataType (&first_args)[n], // TODO: Place fallback implementations for all simd integer types if constexpr (std::is_floating_point_v) { host_check_math_op_all_loaders(divides(), n, first_args, second_args); + +#if defined(__INTEL_COMPILER) && \ + (defined(KOKKOS_ARCH_AVX2) || defined(KOKKOS_ARCH_AVX512XEON)) + host_check_math_op_all_loaders(cbrt_op(), n, first_args); + host_check_math_op_all_loaders(exp_op(), n, first_args); + host_check_math_op_all_loaders(log_op(), n, first_args); +#endif } } @@ -109,23 +122,29 @@ inline void host_check_abi_size() { template inline void host_check_math_ops() { constexpr size_t n = 11; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); host_check_abi_size(); if constexpr (!std::is_integral_v) { - DataType const first_args[n] = {0.1, 0.4, 0.5, 0.7, 1.0, 1.5, - -2.0, 10.0, 0.0, 1.2, -2.8}; - DataType const second_args[n] = {1.0, 0.2, 1.1, 1.8, -0.1, -3.0, - -2.4, 1.0, 13.0, -3.2, -2.1}; + alignas(alignment) DataType const first_args[n] = { + 0.1, 0.4, 0.5, 0.7, 1.0, 1.5, -2.0, 
10.0, 0.0, 1.2, -2.8}; + alignas(alignment) DataType const second_args[n] = { + 1.0, 0.2, 1.1, 1.8, -0.1, -3.0, -2.4, 1.0, 13.0, -3.2, -2.1}; host_check_all_math_ops(first_args, second_args); } else { if constexpr (std::is_signed_v) { - DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, -3, -2, 1, 13, -3, -2}; + alignas(alignment) + DataType const first_args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + alignas(alignment) DataType const second_args[n] = {1, 2, 1, 1, 1, -3, + -2, 1, 13, -3, -2}; host_check_all_math_ops(first_args, second_args); } else { - DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; - DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; + alignas(alignment) + DataType const first_args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + alignas(alignment) + DataType const second_args[n] = {1, 2, 1, 1, 1, 3, 2, 1, 13, 3, 2}; host_check_all_math_ops(first_args, second_args); } } @@ -202,6 +221,7 @@ KOKKOS_INLINE_FUNCTION void device_check_math_op_all_loaders(Op op, device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); device_check_math_op_one_loader(op, n, args...); + device_check_math_op_one_loader(op, n, args...); } template @@ -282,8 +302,13 @@ TEST(simd, host_math_ops) { } TEST(simd, device_math_ops) { - Kokkos::parallel_for(Kokkos::RangePolicy>(0, 1), - simd_device_math_ops_functor()); +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_math_ops_functor()); } #endif diff --git a/simd/unit_tests/include/TestSIMD_Reductions.hpp b/simd/unit_tests/include/TestSIMD_Reductions.hpp new file mode 100644 index 00000000000..b3c7ac9a01e --- /dev/null +++ b/simd/unit_tests/include/TestSIMD_Reductions.hpp @@ -0,0 +1,184 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TEST_SIMD_REDUCTIONS_HPP +#define KOKKOS_TEST_SIMD_REDUCTIONS_HPP + +#include +#include + +template +inline void host_check_reduction_one_loader(ReductionOp reduce_op, + std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.host_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < nlanes; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_host_serial(value); + auto computed = reduce_op.on_host(value); + + gtest_checker().equality(expected, computed); + } +} + +template +inline void host_check_reduction_all_loaders(ReductionOp reduce_op, + std::size_t n, T const* args) { + host_check_reduction_one_loader(reduce_op, n, + args); + host_check_reduction_one_loader(reduce_op, n, args); + host_check_reduction_one_loader(reduce_op, n, args); +} + +template +inline void host_check_all_reductions(const DataType (&args)[n]) { + host_check_reduction_all_loaders(hmin(), n, args); + host_check_reduction_all_loaders(hmax(), n, args); + host_check_reduction_all_loaders(reduce(), n, args); +} + +template +inline void host_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + host_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + host_check_all_reductions(args); + } +} + +template +inline void host_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + (host_check_reductions(), ...); +} + +template +inline void host_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (host_check_reductions_all_types(DataTypes()), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_one_loader( + ReductionOp reduce_op, std::size_t n, T const* args) { + Loader loader; + using simd_type = Kokkos::Experimental::simd; + using mask_type = typename Kokkos::Experimental::simd::mask_type; + constexpr std::size_t width = simd_type::size(); + + for (std::size_t i = 0; i < n; i += width) { + std::size_t const nremaining = n - i; + std::size_t const nlanes = Kokkos::min(nremaining, width); + simd_type arg; + bool const loaded_arg = loader.device_load(args + i, nlanes, arg); + if (!loaded_arg) continue; + + mask_type mask(false); + for (std::size_t j = 0; j < nlanes; ++j) { + mask[j] = true; + } + auto value = where(mask, arg); + auto expected = reduce_op.on_device_serial(value); + auto computed = reduce_op.on_device(value); + + kokkos_checker().equality(expected, computed); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reduction_all_loaders( + ReductionOp reduce_op, std::size_t n, T const* args) { + device_check_reduction_one_loader(reduce_op, n, + args); + device_check_reduction_one_loader(reduce_op, n, args); + device_check_reduction_one_loader(reduce_op, n, args); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_all_reductions( + const DataType (&args)[n]) { + device_check_reduction_all_loaders(hmin(), 
n, args); + device_check_reduction_all_loaders(hmax(), n, args); + device_check_reduction_all_loaders(reduce(), n, args); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions() { + constexpr size_t n = 11; + + if constexpr (std::is_signed_v) { + DataType const args[n] = {1, 2, -1, 10, 0, 1, -2, 10, 0, 1, -2}; + device_check_all_reductions(args); + } else { + DataType const args[n] = {1, 2, 1, 10, 0, 1, 2, 10, 0, 1, 2}; + device_check_all_reductions(args); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_types( + Kokkos::Experimental::Impl::data_types) { + (device_check_reductions(), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_reductions_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (device_check_reductions_all_types(DataTypes()), ...); +} + +class simd_device_reduction_functor { + public: + KOKKOS_INLINE_FUNCTION void operator()(int) const { + device_check_reductions_all_abis( + Kokkos::Experimental::Impl::device_abi_set()); + } +}; + +TEST(simd, host_reductions) { + host_check_reductions_all_abis(Kokkos::Experimental::Impl::host_abi_set()); +} + +TEST(simd, device_reductions) { +#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET + GTEST_SKIP() + << "skipping because of a non-deterministic failure reporting: " + "Failure to synchronize stream (nil): Error in " + "cuStreamSynchronize: an illegal memory access was encountered"; +#endif + Kokkos::parallel_for(1, simd_device_reduction_functor()); +} + +#endif diff --git a/simd/unit_tests/include/TestSIMD_ShiftOps.hpp b/simd/unit_tests/include/TestSIMD_ShiftOps.hpp index f6fdcb920ed..ffdd2cba4a0 100644 --- a/simd/unit_tests/include/TestSIMD_ShiftOps.hpp +++ b/simd/unit_tests/include/TestSIMD_ShiftOps.hpp @@ -85,10 +85,11 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by, n); host_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + host_check_shift_on_one_loader(shift_op, test_vals, + shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -96,6 +97,8 @@ inline void host_check_shift_op_all_loaders(ShiftOp shift_op, shift_by_lanes); host_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + host_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template @@ -104,12 +107,14 @@ inline void host_check_shift_ops() { using simd_type = Kokkos::Experimental::simd; constexpr std::size_t width = simd_type::size(); constexpr std::size_t num_cases = 8; + constexpr size_t alignment = + Kokkos::Experimental::simd::size() * sizeof(DataType); DataType max = std::numeric_limits::max(); - DataType shift_by[num_cases] = { + alignas(alignment) DataType shift_by[num_cases] = { 0, 1, 3, width / 2, width / 2 + 1, width - 1, width, width + 1}; - DataType test_vals[width]; + alignas(alignment) DataType test_vals[width]; for (std::size_t i = 0; i < width; ++i) { DataType inc = max / width; test_vals[i] = i * inc + 1; @@ -201,10 +206,11 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_by, n); device_check_shift_on_one_loader(shift_op, test_vals, shift_by, n); + device_check_shift_on_one_loader( + shift_op, test_vals, shift_by, n); Kokkos::Experimental::simd shift_by_lanes; - 
shift_by_lanes.copy_from(shift_by, - Kokkos::Experimental::element_aligned_tag()); + shift_by_lanes.copy_from(shift_by, Kokkos::Experimental::simd_flag_default); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); @@ -212,6 +218,8 @@ KOKKOS_INLINE_FUNCTION void device_check_shift_op_all_loaders( shift_op, test_vals, shift_by_lanes); device_check_shift_by_lanes_on_one_loader( shift_op, test_vals, shift_by_lanes); + device_check_shift_by_lanes_on_one_loader( + shift_op, test_vals, shift_by_lanes); } template diff --git a/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp b/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp index 129f2b0d5c9..152fd9e9840 100644 --- a/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp +++ b/simd/unit_tests/include/TestSIMD_WhereExpressions.hpp @@ -29,7 +29,7 @@ inline void host_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -46,7 +46,7 @@ inline void host_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); host_check_equality(expected_result, dst_simd, nlanes); } @@ -107,7 +107,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { std::size_t nlanes = simd_type::size(); DataType init[] = {11, 13, 17, 19, 23, 29, 31, 37}; simd_type src; - src.copy_from(init, Kokkos::Experimental::element_aligned_tag()); + src.copy_from(init, Kokkos::Experimental::simd_flag_default); for (std::size_t idx = 0; idx < nlanes; ++idx) { mask_type mask(true); @@ -124,7 +124,7 @@ KOKKOS_INLINE_FUNCTION void device_check_where_expr_scatter_to() { where(mask, src).scatter_to(dst, index); simd_type dst_simd; - dst_simd.copy_from(dst, Kokkos::Experimental::element_aligned_tag()); + dst_simd.copy_from(dst, Kokkos::Experimental::simd_flag_default); device_check_equality(expected_result, dst_simd, nlanes); } diff --git a/tpls/desul/Config.hpp.cmake.in b/tpls/desul/Config.hpp.cmake.in index a7bc738191e..aed7ecfabc9 100644 --- a/tpls/desul/Config.hpp.cmake.in +++ b/tpls/desul/Config.hpp.cmake.in @@ -14,6 +14,8 @@ SPDX-License-Identifier: (BSD-3-Clause) #cmakedefine DESUL_ATOMICS_ENABLE_HIP #cmakedefine DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_SYCL +#cmakedefine DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION #cmakedefine DESUL_ATOMICS_ENABLE_OPENMP +#cmakedefine DESUL_ATOMICS_ENABLE_OPENACC #endif diff --git a/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp b/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp index 082fc132de5..15c6d78d94b 100644 --- a/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp +++ b/tpls/desul/include/desul/atomics/Adapt_SYCL.hpp @@ -88,15 +88,18 @@ using sycl_atomic_ref = sycl::atomic_ref; #endif -// FIXME_SYCL Use SYCL_EXT_ONEAPI_DEVICE_GLOBAL when available instead #ifdef DESUL_SYCL_DEVICE_GLOBAL_SUPPORTED -// FIXME_SYCL The compiler forces us to use device_image_scope. Drop this when possible. 
+#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL +template +using sycl_device_global = sycl::ext::oneapi::experimental::device_global; +#else template using sycl_device_global = sycl::ext::oneapi::experimental::device_global< T, decltype(sycl::ext::oneapi::experimental::properties( sycl::ext::oneapi::experimental::device_image_scope))>; #endif +#endif } // namespace Impl } // namespace desul diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange.hpp index e91569e1dee..72639fc4932 100644 --- a/tpls/desul/include/desul/atomics/Compare_Exchange.hpp +++ b/tpls/desul/include/desul/atomics/Compare_Exchange.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp new file mode 100644 index 00000000000..77149bd4741 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Compare_Exchange_OpenACC.hpp @@ -0,0 +1,153 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ +#define DESUL_ATOMICS_COMPARE_EXCHANGE_OPENACC_HPP_ + +#include + +#include +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope /*scope*/) { + if constexpr (std::is_arithmetic_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // Floating point types treated separately to work around compiler errors + // "parse invalid cast opcode for cast from 'i32' to 'float'".
+ // Also not just "forwarding" arguments to atomicCAS because it does not have an + // overload that takes int64_t + if constexpr (std::is_integral_v && ((sizeof(T) == 4) || (sizeof(T) == 8))) { + static_assert(sizeof(unsigned int) == 4); + static_assert(sizeof(unsigned long long int) == 8); + using cas_t = + std::conditional_t<(sizeof(T) == 4), unsigned int, unsigned long long int>; + cas_t return_val = atomicCAS(reinterpret_cast(dest), + reinterpret_cast(compare), + reinterpret_cast(value)); + return reinterpret_cast(return_val); +#ifdef DESUL_CUDA_ARCH_IS_PRE_PASCAL + } else if constexpr (std::is_same_v) { +#else + } else if constexpr (std::is_same_v || std::is_same_v) { +#endif + return atomicCAS(dest, compare, value); + } else { + // FIXME_OPENACC + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; + } +} + +#else // not NVHPC + +#pragma acc routine seq +template +T device_atomic_exchange(T* dest, T value, MemoryOrder, MemoryScope) { + if constexpr (std::is_arithmetic_v) { + T return_val; +#pragma acc atomic capture + { + return_val = *dest; + *dest = value; + } + return return_val; + } else { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_exchange(): Not supported atomic operation in " + "the OpenACC backend\n"); + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + // } + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + // unlock_address_openacc((void*)dest, scope); + return return_val; + } +} + +#pragma acc routine seq +template +T device_atomic_compare_exchange( + T* dest, T compare, T value, MemoryOrder, MemoryScope scope) { + // FIXME_OPENACC + printf( + "DESUL error in device_atomic_compare_exchange(): Not supported atomic operation " + "in the OpenACC backend\n"); + T current_val = *dest; + // Acquire a lock for the address + // while (!lock_address_openacc((void*)dest, scope)) { + //} + // device_atomic_thread_fence(MemoryOrderAcquire(), scope); + if (current_val == compare) { + *dest = value; + // device_atomic_thread_fence(MemoryOrderRelease(), scope); + } + // unlock_address_openacc((void*)dest, scope); + return current_val; +} + +#endif + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op.hpp b/tpls/desul/include/desul/atomics/Fetch_Op.hpp index adf75c57437..1b161397c74 100644 --- a/tpls/desul/include/desul/atomics/Fetch_Op.hpp +++ b/tpls/desul/include/desul/atomics/Fetch_Op.hpp @@ -23,6 +23,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp new file mode 100644 index 00000000000..ab570ac5787 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Fetch_Op_OpenACC.hpp @@ -0,0 +1,431 @@ +/* +Copyright (c) 
2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ +#ifndef DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_FETCH_OP_OPENACC_HPP_ + +#include // min, max +#include +#include + +namespace desul { +namespace Impl { + +#ifdef __NVCOMPILER + +template +inline constexpr bool is_openacc_integral_type_v = + std::is_same_v || std::is_same_v || + std::is_same_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_same_v || +#ifndef DESUL_CUDA_ARCH_IS_PRE_PASCAL + std::is_same_v || +#endif + is_openacc_integral_type_v; + +#else + +template +inline constexpr bool is_openacc_integral_type_v = std::is_integral_v; + +template +inline constexpr bool is_openacc_arithmetic_type_v = std::is_arithmetic_v; + +#endif + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_add( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_inc( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr += T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_sub( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_dec( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr -= T(1); + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_mul( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr *= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_div( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr /= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_lshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr << val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_rshift( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr = *ptr >> val; + } + return old; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_max( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; + old = atomicMax(ptr, val); + return old; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_min( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + int old; + old = atomicMin(ptr, val); + return old; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_and( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr &= val; + } + return old; +} + +#pragma acc routine 
seq +template +std::enable_if_t, T> device_atomic_fetch_or( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr |= val; + } + return old; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_fetch_xor( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T old; +#pragma acc atomic capture + { + old = *ptr; + *ptr ^= val; + } + return old; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_add_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_inc_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr += T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_sub_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_dec_fetch( + T* ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr -= T(1); + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_mul_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr *= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_div_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr /= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_lshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr << val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_rshift_fetch( + T* ptr, const unsigned int val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr = *ptr >> val; + tmp = *ptr; + } + return tmp; +} + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_max_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMax(ptr, val); + tmp = std::max(tmp, val); + return tmp; +} +#endif + +#ifdef __NVCOMPILER +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_min_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; + tmp = atomicMin(ptr, val); + tmp = std::min(tmp, val); + return tmp; +} +#endif + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_and_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr &= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_or_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma acc atomic capture + { + *ptr |= val; + tmp = *ptr; + } + return tmp; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_xor_fetch( + T* ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { + T tmp; +#pragma 
acc atomic capture + { + *ptr ^= val; + tmp = *ptr; + } + return tmp; +} +// + +// +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelaxed, MemoryScopeDevice) { +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, void> device_atomic_store( + T* const ptr, const T val, MemoryOrderRelease, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_store(MemoryOrderRelease): Not supported atomic " + "operation in the OpenACC backend\n"); + } +#pragma acc atomic write + *ptr = val; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderRelaxed, MemoryScopeDevice) { + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} + +#pragma acc routine seq +template +std::enable_if_t, T> device_atomic_load( + const T* const ptr, MemoryOrderAcquire, MemoryScopeDevice) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_load(MemoryOrderAcquire): Not supported atomic " + "operation in the OpenACC backend\n"); + } + T retval; +#pragma acc atomic read + retval = *ptr; + return retval; +} +// + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Generic.hpp b/tpls/desul/include/desul/atomics/Generic.hpp index fef10222e34..fa71477c299 100644 --- a/tpls/desul/include/desul/atomics/Generic.hpp +++ b/tpls/desul/include/desul/atomics/Generic.hpp @@ -18,11 +18,14 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul { +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_thread_fence(MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_DEVICE(return Impl::device_atomic_thread_fence(order, scope);) DESUL_IF_ON_HOST(return Impl::host_atomic_thread_fence(order, scope);) } + +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { @@ -30,6 +33,7 @@ atomic_exchange(T* dest, T val, MemoryOrder order, MemoryScope scope) { DESUL_IF_ON_HOST(return Impl::host_atomic_exchange(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope scope) { @@ -40,6 +44,7 @@ atomic_compare_exchange(T* dest, T cmp, T val, MemoryOrder order, MemoryScope sc } // Fetch_Oper atomics: return value before operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -47,6 +52,7 @@ atomic_fetch_add(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -54,6 +60,7 @@ atomic_fetch_sub(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -61,6 +68,7 @@ atomic_fetch_max(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_max(dest, val, order, scope);) } 
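
Every public atomic entry point in Generic.hpp gains the same `DESUL_IMPL_ACC_ROUTINE_DIRECTIVE` prefix, so the pattern is worth spelling out once. Below is a minimal schematic of what each of these hunks produces; `my_atomic_op`, `device_impl`, and `host_impl` are placeholder names for illustration only, while the macros are the real ones (Macros.hpp, further down in this patch, defines `DESUL_IMPL_ACC_ROUTINE_DIRECTIVE` to expand to `_Pragma("acc routine seq")` when OpenACC atomics are enabled, and to nothing otherwise):

    // Schematic only; my_atomic_op, device_impl, host_impl are hypothetical names.
    DESUL_IMPL_ACC_ROUTINE_DIRECTIVE  // makes the function callable from OpenACC device code
    template <class T, class MemoryOrder, class MemoryScope>
    DESUL_INLINE_FUNCTION T my_atomic_op(T* dest, T val, MemoryOrder order, MemoryScope scope) {
      DESUL_IF_ON_DEVICE(return Impl::device_impl(dest, val, order, scope);)
      DESUL_IF_ON_HOST(return Impl::host_impl(dest, val, order, scope);)
    }
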
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -68,6 +76,7 @@ atomic_fetch_min(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -75,6 +84,7 @@ atomic_fetch_mul(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -82,6 +92,7 @@ atomic_fetch_div(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -89,6 +100,7 @@ atomic_fetch_mod(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -96,6 +108,7 @@ atomic_fetch_and(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_and(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -103,6 +116,7 @@ atomic_fetch_or(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_or(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -110,6 +124,7 @@ atomic_fetch_xor(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_xor(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -117,6 +132,7 @@ atomic_fetch_nand(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_nand(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, const unsigned int val, @@ -126,6 +142,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_lshift(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_lshift(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, const unsigned int val, @@ -136,6 +153,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_rshift(T* const dest, } // Oper Fetch atomics: return value after operation +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -143,6 +161,7 @@ atomic_add_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_add_fetch(dest, val, order, scope);) } 
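
The two family headers above ("return value before operation" vs. "return value after operation") are the only semantic difference between `atomic_fetch_*` and `atomic_*_fetch`. A short host-side usage sketch, assuming the umbrella header and relaxed ordering at device scope; the commented values follow directly from those semantics:

    #include <desul/atomics.hpp>

    void fetch_vs_op_fetch() {
      int x = 5;
      // fetch-then-op: returns the value *before* the update
      int before = desul::atomic_fetch_add(
          &x, 2, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
      // before == 5, x == 7
      // op-then-fetch: returns the value *after* the update
      int after = desul::atomic_add_fetch(
          &x, 2, desul::MemoryOrderRelaxed(), desul::MemoryScopeDevice());
      // after == 9, x == 9
      (void)before; (void)after;
    }
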
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -150,6 +169,7 @@ atomic_sub_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_sub_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -157,6 +177,7 @@ atomic_max_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_max_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -164,6 +185,7 @@ atomic_min_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_min_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -171,6 +193,7 @@ atomic_mul_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mul_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -178,6 +201,7 @@ atomic_div_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_div_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -185,6 +209,7 @@ atomic_mod_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_mod_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -192,6 +217,7 @@ atomic_and_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_and_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -199,6 +225,7 @@ atomic_or_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope DESUL_IF_ON_HOST(return Impl::host_atomic_or_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -206,6 +233,7 @@ atomic_xor_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scop DESUL_IF_ON_HOST(return Impl::host_atomic_xor_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope scope) { @@ -213,6 +241,7 @@ atomic_nand_fetch(T* const dest, const T val, MemoryOrder order, MemoryScope sco DESUL_IF_ON_HOST(return Impl::host_atomic_nand_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, const unsigned int val, @@ -222,6 +251,7 @@ DESUL_INLINE_FUNCTION T atomic_lshift_fetch(T* const dest, 
DESUL_IF_ON_HOST(return Impl::host_atomic_lshift_fetch(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, const unsigned int val, @@ -233,6 +263,7 @@ DESUL_INLINE_FUNCTION T atomic_rshift_fetch(T* const dest, // Other atomics +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, MemoryOrder order, @@ -241,6 +272,7 @@ DESUL_INLINE_FUNCTION T atomic_load(const T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_load(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_store(T* const dest, const T val, @@ -250,6 +282,7 @@ DESUL_INLINE_FUNCTION void atomic_store(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_store(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_add(T* const dest, const T val, @@ -259,6 +292,7 @@ DESUL_INLINE_FUNCTION void atomic_add(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_add(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, const T val, @@ -268,6 +302,7 @@ DESUL_INLINE_FUNCTION void atomic_sub(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_sub(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, const T val, @@ -277,6 +312,7 @@ DESUL_INLINE_FUNCTION void atomic_mul(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_mul(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_div(T* const dest, const T val, @@ -286,6 +322,7 @@ DESUL_INLINE_FUNCTION void atomic_div(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_div(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_min(T* const dest, const T val, @@ -295,6 +332,7 @@ DESUL_INLINE_FUNCTION void atomic_min(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_min(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_max(T* const dest, const T val, @@ -304,6 +342,7 @@ DESUL_INLINE_FUNCTION void atomic_max(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_max(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, MemoryOrder order, @@ -312,6 +351,7 @@ DESUL_INLINE_FUNCTION T atomic_inc_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, MemoryOrder order, @@ -320,6 +360,7 @@ DESUL_INLINE_FUNCTION T atomic_dec_fetch(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_dec_fetch(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, MemoryOrder order, @@ -328,6 +369,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -335,6 +377,7 @@ atomic_fetch_inc_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_inc_mod(dest, val, order, scope);) } 
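
Of the functions above, `atomic_fetch_inc_mod` is the least self-explanatory; together with `atomic_fetch_dec_mod` in the next hunk, it mirrors the wrap-around contract of CUDA's `atomicInc`/`atomicDec`. The sketch below illustrates only the intended semantics as a plain (non-atomic) function; it is not desul's implementation:

    // Contract illustration only; the real function performs this
    // read-modify-write atomically.
    unsigned fetch_inc_mod_semantics(unsigned* dest, unsigned val) {
      unsigned old = *dest;
      *dest = (old >= val) ? 0u : old + 1u;  // wrap back to 0 once val is reached
      return old;                            // value held before the operation
    }
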
+DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, MemoryOrder order, @@ -343,6 +386,7 @@ DESUL_INLINE_FUNCTION T atomic_fetch_dec(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION T atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) { @@ -350,6 +394,7 @@ atomic_fetch_dec_mod(T* const dest, T val, MemoryOrder order, MemoryScope scope) DESUL_IF_ON_HOST(return Impl::host_atomic_fetch_dec_mod(dest, val, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, MemoryOrder order, @@ -358,6 +403,7 @@ DESUL_INLINE_FUNCTION void atomic_inc(T* const dest, DESUL_IF_ON_HOST(return Impl::host_atomic_inc(dest, order, scope);) } +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, MemoryOrder order, @@ -367,6 +413,7 @@ DESUL_INLINE_FUNCTION void atomic_dec(T* const dest, } // FIXME +DESUL_IMPL_ACC_ROUTINE_DIRECTIVE template SYCL_SPACE_ATOMIC_LOCKS_DEVICE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_DEVICE; -SYCL_EXTERNAL extern sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +SYCL_EXTERNAL extern +#else +static +#endif + sycl_device_global + SYCL_SPACE_ATOMIC_LOCKS_NODE; #define SYCL_SPACE_ATOMIC_MASK 0x1FFFF @@ -128,6 +149,34 @@ inline void unlock_address_sycl(void* ptr, MemoryScopeNode) { lock_node_ref.exchange(0); } +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue q) { + static bool once = [&q]() { +#ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, + &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, + sizeof(int32_t*)); + q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, + &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, + sizeof(int32_t*)); +#else + auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; + auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; + q.single_task([=] { + SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; + SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; + }); +#endif + return true; + }(); + (void)once; +} + #else // not supported template @@ -155,7 +204,26 @@ inline bool lock_address_sycl(void*, MemoryScopeNode) { inline void unlock_address_sycl(void*, MemoryScopeDevice) { assert(false); } inline void unlock_address_sycl(void*, MemoryScopeNode) { assert(false); } + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline +#else +inline static +#endif + void + copy_sycl_lock_arrays_to_device(sycl::queue) { +} + #endif } // namespace Impl + +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION +inline void ensure_sycl_lock_arrays_on_device(sycl::queue) {} +#else +static inline void ensure_sycl_lock_arrays_on_device(sycl::queue q) { + Impl::copy_sycl_lock_arrays_to_device(q); +} +#endif + } // namespace desul #endif diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp index cb97f4a906d..b6a399100b1 100644 --- a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp +++ b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op.hpp @@ -17,6 +17,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_HIP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include 
+#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp new file mode 100644 index 00000000000..d4dd74588bd --- /dev/null +++ b/tpls/desul/include/desul/atomics/Lock_Based_Fetch_Op_OpenACC.hpp @@ -0,0 +1,81 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ +#define DESUL_ATOMICS_LOCK_BASED_FETCH_OP_OPENACC_HPP_ + +#include +#include +#include +#include + +namespace desul { +namespace Impl { + +template = 0> +inline T device_atomic_fetch_oper(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_fetch_oper(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = *dest; + *dest = op.apply(return_val, val); + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +template = 0> +inline T device_atomic_oper_fetch(const Oper& op, + T* const dest, + dont_deduce_this_parameter_t val, + MemoryOrder /*order*/, + MemoryScope scope) { + if (acc_on_device(acc_device_not_host)) { + printf( + "DESUL error in device_atomic_oper_fetch(): Not supported atomic operation in " + "the OpenACC backend\n"); + } + // Acquire a lock for the address + while (!lock_address((void*)dest, scope)) { + } + + device_atomic_thread_fence(MemoryOrderAcquire(), scope); + T return_val = op.apply(*dest, val); + *dest = return_val; + device_atomic_thread_fence(MemoryOrderRelease(), scope); + unlock_address((void*)dest, scope); + return return_val; +} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/include/desul/atomics/Macros.hpp b/tpls/desul/include/desul/atomics/Macros.hpp index 3a14b93d323..d11beb0c805 100644 --- a/tpls/desul/include/desul/atomics/Macros.hpp +++ b/tpls/desul/include/desul/atomics/Macros.hpp @@ -57,6 +57,10 @@ SPDX-License-Identifier: (BSD-3-Clause) #define DESUL_HAVE_OPENMP_ATOMICS #endif +#if defined(DESUL_ATOMICS_ENABLE_OPENACC) +#define DESUL_HAVE_OPENACC_ATOMICS +#endif + // ONLY use GNUC atomics if not explicitly say to use OpenMP atomics #if !defined(DESUL_HAVE_OPENMP_ATOMICS) && defined(__GNUC__) #define DESUL_HAVE_GCC_ATOMICS @@ -123,6 +127,30 @@ static constexpr bool desul_impl_omp_on_host() { return false; } #endif #endif +#if defined(DESUL_HAVE_OPENACC_ATOMICS) +#include +#ifdef __NVCOMPILER +// FIXME_OPENACC We cannot determine in a constant expression whether we are on host or +// on device with NVHPC. We use the device implementation on both sides.
+#define DESUL_IF_ON_DEVICE(CODE) \ + { DESUL_IMPL_STRIP_PARENS(CODE) } +#define DESUL_IF_ON_HOST(CODE) \ + {} +#else +#define DESUL_IF_ON_DEVICE(CODE) \ + if constexpr (acc_on_device(acc_device_not_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#define DESUL_IF_ON_HOST(CODE) \ + if constexpr (acc_on_device(acc_device_host)) { \ + DESUL_IMPL_STRIP_PARENS(CODE) \ + } +#endif +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE _Pragma("acc routine seq") +#else +#define DESUL_IMPL_ACC_ROUTINE_DIRECTIVE +#endif + #if !defined(DESUL_IF_ON_HOST) && !defined(DESUL_IF_ON_DEVICE) #if (defined(DESUL_ATOMICS_ENABLE_CUDA) && defined(__CUDA_ARCH__)) || \ (defined(DESUL_ATOMICS_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ diff --git a/tpls/desul/include/desul/atomics/Thread_Fence.hpp b/tpls/desul/include/desul/atomics/Thread_Fence.hpp index 24078aae07f..6a741f6d478 100644 --- a/tpls/desul/include/desul/atomics/Thread_Fence.hpp +++ b/tpls/desul/include/desul/atomics/Thread_Fence.hpp @@ -26,6 +26,9 @@ SPDX-License-Identifier: (BSD-3-Clause) #ifdef DESUL_HAVE_OPENMP_ATOMICS #include #endif +#ifdef DESUL_HAVE_OPENACC_ATOMICS +#include +#endif #ifdef DESUL_HAVE_SYCL_ATOMICS #include #endif diff --git a/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp b/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp new file mode 100644 index 00000000000..a5c8aa1c8a7 --- /dev/null +++ b/tpls/desul/include/desul/atomics/Thread_Fence_OpenACC.hpp @@ -0,0 +1,25 @@ +/* +Copyright (c) 2019, Lawrence Livermore National Security, LLC +and DESUL project contributors. See the COPYRIGHT file for details. +Source: https://github.com/desul/desul + +SPDX-License-Identifier: (BSD-3-Clause) +*/ + +#ifndef DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ +#define DESUL_ATOMICS_THREAD_FENCE_OPENACC_HPP_ + +namespace desul { +namespace Impl { + +#pragma acc routine seq +template +void device_atomic_thread_fence(MemoryOrder, MemoryScope) { + // FIXME_OPENACC: The current OpenACC standard does not support explicit thread fence + // operations. 
+} + +} // namespace Impl +} // namespace desul + +#endif diff --git a/tpls/desul/src/Lock_Array_SYCL.cpp b/tpls/desul/src/Lock_Array_SYCL.cpp index 9e84c60e41a..6660c76e11a 100644 --- a/tpls/desul/src/Lock_Array_SYCL.cpp +++ b/tpls/desul/src/Lock_Array_SYCL.cpp @@ -14,10 +14,12 @@ SPDX-License-Identifier: (BSD-3-Clause) namespace desul::Impl { +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_DEVICE; SYCL_EXTERNAL sycl_device_global SYCL_SPACE_ATOMIC_LOCKS_NODE; +#endif int32_t* SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; int32_t* SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; @@ -31,19 +33,7 @@ void init_lock_arrays_sycl(sycl::queue q) { SYCL_SPACE_ATOMIC_LOCKS_NODE_h = sycl::malloc_host(SYCL_SPACE_ATOMIC_MASK + 1, q); - // FIXME_SYCL Once supported, the following should be replaced by - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_DEVICE, - // &SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, - // sizeof(int32_t*)); - // q.memcpy(SYCL_SPACE_ATOMIC_LOCKS_NODE, - // &SYCL_SPACE_ATOMIC_LOCKS_NODE_h, - // sizeof(int32_t*)); - auto device_ptr = SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h; - auto node_ptr = SYCL_SPACE_ATOMIC_LOCKS_NODE_h; - q.single_task([=] { - SYCL_SPACE_ATOMIC_LOCKS_DEVICE.get() = device_ptr; - SYCL_SPACE_ATOMIC_LOCKS_NODE.get() = node_ptr; - }); + copy_sycl_lock_arrays_to_device(q); q.memset(SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h, 0, @@ -63,7 +53,10 @@ void finalize_lock_arrays_sycl(sycl::queue q) { sycl::free(SYCL_SPACE_ATOMIC_LOCKS_NODE_h, q); SYCL_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; SYCL_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; +#ifdef DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION + copy_sycl_lock_arrays_to_device(q); +#endif } -} // namespace desul::Impl +} // namespace desul::Impl #endif diff --git a/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp b/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp index ab1561bd47f..25389a2fa5e 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/compressed_pair.hpp @@ -27,165 +27,165 @@ namespace detail { // For no unique address emulation, this is the case taken when neither are empty. // For real `[[no_unique_address]]`, this case is always taken. 
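
Before the renamed specializations below, a reminder of what `__compressed_pair` is for: when one of the two members is an empty class (a stateless layout mapping or accessor, say), inheriting from it instead of storing it lets the pair add no storage, emulating `[[no_unique_address]]` on compilers that lack it. A standalone illustration of the effect on common ABIs (not mdspan code):

    struct Empty {};  // e.g. a stateless accessor policy

    struct Plain    { Empty e; double* p; };  // member still costs size/padding
    struct Squashed : Empty { double* p; };   // empty-base optimization: no overhead

    static_assert(sizeof(Plain) > sizeof(double*), "member takes space");
    static_assert(sizeof(Squashed) == sizeof(double*), "base is compressed away");
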
-template struct __compressed_pair { - _MDSPAN_NO_UNIQUE_ADDRESS _T __t_val; - _MDSPAN_NO_UNIQUE_ADDRESS _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; +template struct __compressed_pair { + _MDSPAN_NO_UNIQUE_ADDRESS _T1 __t1_val{}; + _MDSPAN_NO_UNIQUE_ADDRESS _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : __t_val((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : __t1_val((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; #if !defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) // First empty. 
-template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && !_MDSPAN_TRAIT(std::is_empty, _U)>> - : private _T { - _U __u_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { - return *static_cast<_T *>(this); + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && !_MDSPAN_TRAIT(std::is_empty, _T2)>> + : private _T1 { + _T2 __t2_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { + return *static_cast<_T1 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return *static_cast<_T const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return *static_cast<_T1 const *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { return __u_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return __u_val; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return __t2_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return __t2_val; } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _T((_TLike &&) __t), __u_val((_ULike &&) __u) {} + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T1((_T1Like &&) __t1), __t2_val((_T2Like &&) __t2) {} }; // Second empty. 
-template +template struct __compressed_pair< - _T, _U, - std::enable_if_t> - : private _U { - _T __t_val; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { return __t_val; } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { - return __t_val; + _T1, _T2, + std::enable_if_t> + : private _T2 { + _T1 __t1_val{}; + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return __t1_val; } + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { + return __t1_val; } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { - return *static_cast<_U *>(this); + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { + return *static_cast<_T2 *>(this); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { - return *static_cast<_U const *>(this); + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { + return *static_cast<_T2 const *>(this); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; + ~__compressed_pair() = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) - : _U((_ULike &&) __u), __t_val((_TLike &&) __t) {} + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) + : _T2((_T2Like &&) __t2), __t1_val((_T1Like &&) __t1) {} }; // Both empty. -template +template struct __compressed_pair< - _T, _U, - std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T) && _MDSPAN_TRAIT(std::is_empty, _U)>> + _T1, _T2, + std::enable_if_t<_MDSPAN_TRAIT(std::is_empty, _T1) && _MDSPAN_TRAIT(std::is_empty, _T2)>> // We need to use the __no_unique_address_emulation wrapper here to avoid // base class ambiguities. #ifdef _MDSPAN_COMPILER_MSVC // MSVC doesn't allow you to access public static member functions of a type // when you *happen* to privately inherit from that type. 
- : protected __no_unique_address_emulation<_T, 0>, - protected __no_unique_address_emulation<_U, 1> + : protected __no_unique_address_emulation<_T1, 0>, + protected __no_unique_address_emulation<_T2, 1> #else - : private __no_unique_address_emulation<_T, 0>, - private __no_unique_address_emulation<_U, 1> + : private __no_unique_address_emulation<_T1, 0>, + private __no_unique_address_emulation<_T2, 1> #endif { - using __first_base_t = __no_unique_address_emulation<_T, 0>; - using __second_base_t = __no_unique_address_emulation<_U, 1>; + using __first_base_t = __no_unique_address_emulation<_T1, 0>; + using __second_base_t = __no_unique_address_emulation<_T2, 1>; - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T &__first() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T1 &__first() noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _T const &__first() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T1 const &__first() const noexcept { return this->__first_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _U &__second() noexcept { + MDSPAN_FORCE_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 _T2 &__second() noexcept { return this->__second_base_t::__ref(); } - MDSPAN_FORCE_INLINE_FUNCTION constexpr _U const &__second() const noexcept { + MDSPAN_FORCE_INLINE_FUNCTION constexpr _T2 const &__second() const noexcept { return this->__second_base_t::__ref(); } MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair() noexcept = default; + constexpr __compressed_pair() = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair const &) noexcept = default; + constexpr __compressed_pair(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - constexpr __compressed_pair(__compressed_pair &&) noexcept = default; + constexpr __compressed_pair(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair const &) noexcept = default; + operator=(__compressed_pair const &) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED __compressed_pair & - operator=(__compressed_pair &&) noexcept = default; + operator=(__compressed_pair &&) = default; MDSPAN_INLINE_FUNCTION_DEFAULTED - ~__compressed_pair() noexcept = default; - template - MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_TLike &&__t, _ULike &&__u) noexcept - : __first_base_t(_T((_TLike &&) __t)), - __second_base_t(_U((_ULike &&) __u)) + ~__compressed_pair() = default; + template + MDSPAN_INLINE_FUNCTION constexpr __compressed_pair(_T1Like &&__t1, _T2Like &&__t2) noexcept + : __first_base_t(_T1((_T1Like &&) __t1)), + __second_base_t(_T2((_T2Like &&) __t2)) { } }; diff --git a/tpls/mdspan/include/experimental/__p0009_bits/config.hpp b/tpls/mdspan/include/experimental/__p0009_bits/config.hpp index d35e201cebd..8e42a37ba7c 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/config.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/config.hpp @@ -35,10 +35,17 @@ #define MDSPAN_CXX_STD_14 201402L #define MDSPAN_CXX_STD_17 201703L #define MDSPAN_CXX_STD_20 202002L +// Note GCC has not updated this in version 13 +#ifdef __clang__ +#define MDSPAN_CXX_STD_23 202302L +#else +#define MDSPAN_CXX_STD_23 202100L +#endif #define MDSPAN_HAS_CXX_14 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14) #define MDSPAN_HAS_CXX_17 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_17) #define MDSPAN_HAS_CXX_20 (_MDSPAN_CPLUSPLUS >= 
MDSPAN_CXX_STD_20) +#define MDSPAN_HAS_CXX_23 (_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_23) static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or later."); @@ -224,7 +231,7 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or #endif #ifndef MDSPAN_CONDITIONAL_EXPLICIT -# if MDSPAN_HAS_CXX_20 && !defined(_MDSPAN_COMPILER_MSVC) +# if MDSPAN_HAS_CXX_20 # define MDSPAN_CONDITIONAL_EXPLICIT(COND) explicit(COND) # else # define MDSPAN_CONDITIONAL_EXPLICIT(COND) diff --git a/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp b/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp index 0dd31c4cd0a..9a28c3ed5ca 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/extents.hpp @@ -55,6 +55,14 @@ __check_compatible_extents( return {}; } +template +MDSPAN_INLINE_FUNCTION +static constexpr bool are_valid_indices() { + return + (std::is_convertible::value && ... && true) && + (std::is_nothrow_constructible::value && ... && true); +} + // ------------------------------------------------------------------ // ------------ static_array ---------------------------------------- // ------------------------------------------------------------------ @@ -140,7 +148,8 @@ struct index_sequence_scan_impl { template struct index_sequence_scan_impl { -#if defined(__NVCC__) || defined(__NVCOMPILER) +#if defined(__NVCC__) || defined(__NVCOMPILER) || \ + defined(_MDSPAN_COMPILER_INTEL) // NVCC warns about pointless comparison with 0 for R==0 and r being const // evaluatable and also 0. MDSPAN_INLINE_FUNCTION @@ -167,7 +176,7 @@ template <> struct index_sequence_scan_impl<0> { // all static values. template struct possibly_empty_array { - T vals[N]; + T vals[N]{}; MDSPAN_INLINE_FUNCTION constexpr T &operator[](size_t r) { return vals[r]; } MDSPAN_INLINE_FUNCTION @@ -251,12 +260,17 @@ struct maybe_static_array { #ifdef __cpp_lib_span MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, - /* requires */ (N == m_size_dynamic)) + /* requires */ (N == m_size_dynamic && N > 0)) MDSPAN_INLINE_FUNCTION constexpr maybe_static_array(const std::span &vals) { for (size_t r = 0; r < N; r++) m_dyn_vals[r] = static_cast(vals[r]); } + + MDSPAN_TEMPLATE_REQUIRES(class T, size_t N, + /* requires */ (N == m_size_dynamic && N == 0)) + MDSPAN_INLINE_FUNCTION + constexpr maybe_static_array(const std::span &) : m_dyn_vals{} {} #endif // constructors from all values @@ -423,9 +437,9 @@ template class extents { class OtherIndexType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && + _MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, - OtherIndexType) && + const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -436,8 +450,8 @@ template class extents { MDSPAN_TEMPLATE_REQUIRES( class OtherIndexType, size_t N, /* requires */ - (_MDSPAN_TRAIT(std::is_convertible, OtherIndexType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, OtherIndexType) && + (_MDSPAN_TRAIT(std::is_convertible, const OtherIndexType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const OtherIndexType&) && (N == m_rank || N == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT(N != m_rank_dynamic) @@ -454,6 +468,7 @@ template class extents { size_t DynCount, size_t R, class 
OtherExtents, class... DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) == dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -468,6 +483,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R < m_rank) && (static_extent(R) != dynamic_extent))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &exts, @@ -481,6 +497,7 @@ template class extents { size_t DynCount, size_t R, class OtherExtents, class... DynamicValues, /* requires */ ((R == m_rank) && (DynCount == m_rank_dynamic))) MDSPAN_INLINE_FUNCTION + constexpr vals_t __construct_vals_from_extents(std::integral_constant, std::integral_constant, const OtherExtents &, @@ -491,17 +508,20 @@ template class extents { public: // Converting constructor from other extents specializations - MDSPAN_TEMPLATE_REQUIRES( - class OtherIndexType, size_t... OtherExtents, - /* requires */ - ( - /* multi-stage check to protect from invalid pack expansion when sizes - don't match? */ - decltype(detail::__check_compatible_extents( - std::integral_constant{}, + MDSPAN_TEMPLATE_REQUIRES( + class OtherIndexType, size_t... OtherExtents, + /* requires */ + ( + /* multi-stage check to protect from invalid pack expansion when sizes + don't match? */ + decltype(detail::__check_compatible_extents( + // using: sizeof...(Extents) == sizeof...(OtherExtents) as the second argument fails with MSVC+NVCC with some obscure expansion error + // MSVC: 19.38.33133 NVCC: 12.0 + std::integral_constant::rank() == extents::rank()>{}, std::integer_sequence{}, - std::integer_sequence{}))::value)) + std::integer_sequence{}))::value + ) + ) MDSPAN_INLINE_FUNCTION MDSPAN_CONDITIONAL_EXPLICIT((((Extents != dynamic_extent) && (OtherExtents == dynamic_extent)) || @@ -518,10 +538,14 @@ template class extents { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const extents &lhs, const extents &rhs) noexcept { - bool value = true; - for (size_type r = 0; r < m_rank; r++) - value &= rhs.extent(r) == lhs.extent(r); - return value; + if constexpr (rank() != extents::rank()) { + return false; + } else { + using common_t = std::common_type_t; + for (size_type r = 0; r < m_rank; r++) + if(static_cast(rhs.extent(r)) != static_cast(lhs.extent(r))) return false; + } + return true; } #if !(MDSPAN_HAS_CXX_20) @@ -570,7 +594,7 @@ using dextents = typename detail::__make_dextents::type; template extents(IndexTypes...) -> extents; + ((void) sizeof(IndexTypes), ::MDSPAN_IMPL_STANDARD_NAMESPACE::dynamic_extent)...>; #endif // Helper type traits for identifying a class as extents. diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp index af44494a98d..83ed9ef7fe3 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_left.hpp @@ -18,6 +18,9 @@ #include "macros.hpp" #include "trait_backports.hpp" #include "extents.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" +#include +#include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -108,6 +111,36 @@ class layout_left::mapping { */ } +#if MDSPAN_HAS_CXX_17 + /** + * Converting constructor from `layout_left_padded::mapping`. 
+ * + * This overload participates in overload resolution only if _Mapping is a layout_left_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping& __other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -124,13 +157,14 @@ class layout_left::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=0; r<__extents.rank(); r++) { - if(stride != static_cast(other.stride(r))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_left with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=0; r<__extents.rank(); r++) { + if(static_cast(stride) != static_cast(other.stride(r))) + std::abort(); // ("Assigning layout_stride to layout_left with invalid strides."); + stride *= __extents.extent(r); } - stride *= __extents.extent(r); } #endif } @@ -155,10 +189,7 @@ class layout_left::mapping { class... 
Indices, /* requires */ ( (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -172,9 +203,9 @@ class layout_left::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -187,7 +218,10 @@ class layout_left::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -195,7 +229,10 @@ class layout_left::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -215,6 +252,17 @@ class layout_left::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp index a0586484202..3d3927df7bc 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_right.hpp @@ -20,6 +20,7 @@ #include "extents.hpp" #include #include "layout_stride.hpp" +#include "../__p2642_bits/layout_padded_fwd.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { @@ -113,6 +114,34 @@ class layout_right::mapping { */ } + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if _Mapping is a layout_right_padded mapping and + * extents_type is constructible from _Mapping::extents_type. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. 
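+   *
+   * A minimal usage sketch (illustrative only, not part of the change; assumes
+   * a padding value that already equals the trailing extent, so the padded
+   * mapping is contiguous):
+   * \code
+   * using E = extents<int, 2, 4>;
+   * layout_right_padded<4>::mapping<E> padded{E{}};  // stride(0) == 4
+   * layout_right::mapping<E> plain{padded};          // OK: preconditions hold
+   * \endcode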
+ */ +#if MDSPAN_HAS_CXX_17 + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::is_layout_right_padded_mapping<_Mapping>::value + && std::is_constructible_v)) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + mapping(const _Mapping &__other) noexcept + : __extents(__other.extents()) + { + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_mandates(); + MDSPAN_IMPL_PROPOSED_NAMESPACE::detail:: + check_padded_layout_converting_constructor_preconditions< + extents_type>(__other); + } +#endif + MDSPAN_TEMPLATE_REQUIRES( class OtherExtents, /* requires */ ( @@ -129,13 +158,14 @@ class layout_right::mapping { * other.required_span_size() is a representable value of type index_type */ #if !defined(_MDSPAN_HAS_CUDA) && !defined(_MDSPAN_HAS_HIP) && !defined(NDEBUG) - index_type stride = 1; - for(rank_type r=__extents.rank(); r>0; r--) { - if(stride != static_cast(other.stride(r-1))) { - // Note this throw will lead to a terminate if triggered since this function is marked noexcept - throw std::runtime_error("Assigning layout_stride to layout_right with invalid strides."); + if constexpr (extents_type::rank() > 0) { + index_type stride = 1; + using common_t = std::common_type_t; + for(rank_type r=__extents.rank(); r>0; r--) { + if(static_cast(stride) != static_cast(other.stride(r-1))) + std::abort(); // ("Assigning layout_stride to layout_right with invalid strides."); + stride *= __extents.extent(r-1); } - stride *= __extents.extent(r-1); } #endif } @@ -157,13 +187,10 @@ class layout_right::mapping { //-------------------------------------------------------------------------------- MDSPAN_TEMPLATE_REQUIRES( - class... Indices, + class ... Indices, /* requires */ ( - (sizeof...(Indices) == extents_type::rank()) && - _MDSPAN_FOLD_AND( - (_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices)) - ) + (sizeof...(Indices) == extents_type::rank()) && + (detail::are_valid_indices()) ) ) _MDSPAN_HOST_DEVICE @@ -174,9 +201,9 @@ class layout_right::mapping { MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return true; } MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return true; } - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_exhaustive() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION constexpr index_type stride(rank_type i) const noexcept @@ -189,7 +216,10 @@ class layout_right::mapping { return value; } - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires */ ( Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() == rhs.extents(); @@ -197,7 +227,10 @@ class layout_right::mapping { // In C++ 20 the not equal exists if equal is found #if !(MDSPAN_HAS_CXX_20) - template + MDSPAN_TEMPLATE_REQUIRES( + class OtherExtents, + /* requires 
*/ (Extents::rank() == OtherExtents::rank()) + ) MDSPAN_INLINE_FUNCTION friend constexpr bool operator!=(mapping const& lhs, mapping const& rhs) noexcept { return lhs.extents() != rhs.extents(); @@ -217,6 +250,17 @@ class layout_right::mapping { private: _MDSPAN_NO_UNIQUE_ADDRESS extents_type __extents{}; + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; } // end namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp index 030a494529b..15ad577d149 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/layout_stride.hpp @@ -88,7 +88,7 @@ struct layout_stride { : private detail::__no_unique_address_emulation< detail::__compressed_pair< Extents, - std::array + detail::possibly_empty_array > > #endif @@ -109,7 +109,7 @@ struct layout_stride { //---------------------------------------------------------------------------- - using __strides_storage_t = std::array; + using __strides_storage_t = detail::possibly_empty_array; using __member_pair_t = detail::__compressed_pair; #if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) @@ -158,14 +158,16 @@ struct layout_stride { template MDSPAN_INLINE_FUNCTION static constexpr bool _eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_AND((self.stride(Idxs) == other.stride(Idxs)) /* && ... */) - && _MDSPAN_FOLD_AND((self.extents().extent(Idxs) == other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_AND((static_cast(self.stride(Idxs)) == static_cast(other.stride(Idxs))) /* && ... */) + && _MDSPAN_FOLD_AND((static_cast(self.extents().extent(Idxs)) == static_cast(other.extents().extent(Idxs))) /* || ... */); } template MDSPAN_INLINE_FUNCTION static constexpr bool _not_eq_impl(mapping const& self, mapping const& other) noexcept { - return _MDSPAN_FOLD_OR((self.stride(Idxs) != other.stride(Idxs)) /* || ... */) - || _MDSPAN_FOLD_OR((self.extents().extent(Idxs) != other.extents().extent(Idxs)) /* || ... */); + using common_t = std::common_type_t; + return _MDSPAN_FOLD_OR((static_cast(self.stride(Idxs)) != static_cast(other.stride(Idxs))) /* || ... */) + || _MDSPAN_FOLD_OR((static_cast(self.extents().extent(Idxs)) != static_cast(other.extents().extent(Idxs))) /* || ... */); } template @@ -205,6 +207,11 @@ struct layout_stride { } #endif + MDSPAN_INLINE_FUNCTION + static constexpr std::array return_strides(const __strides_storage_t& s) { + return std::array{s[Idxs]...}; + } + template MDSPAN_INLINE_FUNCTION static constexpr size_t __return_zero() { return 0; } @@ -218,6 +225,21 @@ struct layout_stride { // Can't use defaulted parameter in the __deduction_workaround template because of a bug in MSVC warning C4348. 
using __impl = __deduction_workaround>; + static constexpr __strides_storage_t strides_storage(std::true_type) { + __strides_storage_t s{}; + + extents_type e; + index_type stride = 1; + for(int r = static_cast(extents_type::rank() - 1); r >= 0; r--) { + s[r] = stride; + stride *= e.extent(r); + } + + return s; + } + static constexpr __strides_storage_t strides_storage(std::false_type) { + return {}; + } //---------------------------------------------------------------------------- @@ -233,7 +255,21 @@ struct layout_stride { //-------------------------------------------------------------------------------- - MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping() noexcept +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + : __members{ +#else + : __base_t(__base_t{__member_pair_t( +#endif + extents_type(), + __strides_storage_t(strides_storage(std::integral_constant 0)>{})) +#if defined(_MDSPAN_USE_ATTRIBUTE_NO_UNIQUE_ADDRESS) + } +#else + )}) +#endif + {} + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(mapping const&) noexcept = default; MDSPAN_TEMPLATE_REQUIRES( @@ -332,10 +368,10 @@ struct layout_stride { ) #endif MDSPAN_CONDITIONAL_EXPLICIT( - (!std::is_convertible::value) && - (detail::__is_mapping_of || - detail::__is_mapping_of || - detail::__is_mapping_of) + !(std::is_convertible::value && + (detail::__is_mapping_of || + detail::__is_mapping_of || + detail::__is_mapping_of)) ) // needs two () due to comma MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 mapping(StridedLayoutMapping const& other) noexcept // NOLINT(google-explicit-constructor) @@ -374,7 +410,7 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION constexpr std::array< index_type, extents_type::rank() > strides() const noexcept { - return __strides_storage(); + return __impl::return_strides(__strides_storage()); } MDSPAN_INLINE_FUNCTION @@ -393,8 +429,7 @@ struct layout_stride { class... 
Indices, /* requires */ ( sizeof...(Indices) == Extents::rank() && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -410,17 +445,37 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { - return required_span_size() == __get_size(extents(), std::make_index_sequence()); + if constexpr (extents_type::rank() == 0) + return true; + else { + index_type span_size = required_span_size(); + if (span_size == static_cast(0)) { + if constexpr (extents_type::rank() == 1) { + return stride(0) == 1; + } else { + rank_type r_largest = 0; + for (rank_type r = 1; r < extents_type::rank(); r++) { + if (stride(r) > stride(r_largest)) { + r_largest = r; + } + } + for (rank_type r = 0; r < extents_type::rank(); r++) { + if (extents().extent(r) == 0 && r != r_largest) { + return false; + } + } + return true; + } + } else { + return required_span_size() == __get_size(extents(), std::make_index_sequence()); + } + } } MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } MDSPAN_INLINE_FUNCTION - constexpr index_type stride(rank_type r) const noexcept -#if MDSPAN_HAS_CXX_20 - requires ( Extents::rank() > 0 ) -#endif - { + constexpr index_type stride(rank_type r) const noexcept { return __strides_storage()[r]; } @@ -444,10 +499,13 @@ struct layout_stride { MDSPAN_INLINE_FUNCTION friend constexpr bool operator==(const mapping& x, const StridedLayoutMapping& y) noexcept { bool strides_match = true; - for(rank_type r = 0; r < extents_type::rank(); r++) - strides_match = strides_match && (x.stride(r) == y.stride(r)); + if constexpr (extents_type::rank() > 0) { + using common_t = std::common_type_t; + for(rank_type r = 0; r < extents_type::rank(); r++) + strides_match = strides_match && (static_cast(x.stride(r)) == static_cast(y.stride(r))); + } return (x.extents() == y.extents()) && - (__impl::__OFFSET(y)== static_cast(0)) && + (__impl::__OFFSET(y) == static_cast(0)) && strides_match; } @@ -489,6 +547,17 @@ struct layout_stride { } #endif + // [mdspan.submdspan.mapping], submdspan mapping specialization + template + MDSPAN_INLINE_FUNCTION + constexpr auto submdspan_mapping_impl( + SliceSpecifiers... slices) const; + + template + friend constexpr auto submdspan_mapping( + const mapping& src, SliceSpecifiers... slices) { + return src.submdspan_mapping_impl(slices...); + } }; }; diff --git a/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp b/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp index 6febe300215..d6ec49e65bf 100644 --- a/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp +++ b/tpls/mdspan/include/experimental/__p0009_bits/mdspan.hpp @@ -55,6 +55,13 @@ class mdspan ReferenceType __callop(mdspan const& __self, const std::array& indices) noexcept { return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); } +#ifdef __cpp_lib_span + template + MDSPAN_FORCE_INLINE_FUNCTION static constexpr + ReferenceType __callop(mdspan const& __self, const std::span& indices) noexcept { + return __self.__accessor_ref().access(__self.__ptr_ref(), __self.__mapping_ref()(indices[Idxs]...)); + } +#endif }; public: @@ -109,9 +116,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class... 
SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) && ((sizeof...(SizeTypes) == rank()) || (sizeof...(SizeTypes) == rank_dynamic())) && + (detail::are_valid_indices()) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) ) @@ -125,8 +131,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -142,8 +148,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, size_t N, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) && + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) && ((N == rank()) || (N == rank_dynamic())) && _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type) && _MDSPAN_TRAIT(std::is_default_constructible, accessor_type) @@ -160,7 +166,7 @@ class mdspan (MDSPAN_INLINE_FUNCTION constexpr), mdspan, (data_handle_type p, const extents_type& exts), , /* requires */ (_MDSPAN_TRAIT(std::is_default_constructible, accessor_type) && - _MDSPAN_TRAIT(std::is_constructible, mapping_type, extents_type)) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const extents_type&)) ) : __members(std::move(p), __map_acc_pair_t(mapping_type(exts), accessor_type())) { } @@ -179,10 +185,14 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherAccessor, /* requires */ ( - _MDSPAN_TRAIT(std::is_constructible, mapping_type, typename OtherLayoutPolicy::template mapping) && - _MDSPAN_TRAIT(std::is_constructible, accessor_type, OtherAccessor) + _MDSPAN_TRAIT(std::is_constructible, mapping_type, const typename OtherLayoutPolicy::template mapping&) && + _MDSPAN_TRAIT(std::is_constructible, accessor_type, const OtherAccessor&) ) ) + MDSPAN_CONDITIONAL_EXPLICIT( + !_MDSPAN_TRAIT(std::is_convertible, const typename OtherLayoutPolicy::template mapping&, mapping_type) || + !_MDSPAN_TRAIT(std::is_convertible, const OtherAccessor&, accessor_type) + ) MDSPAN_INLINE_FUNCTION constexpr mdspan(const mdspan& other) : __members(other.__ptr_ref(), __map_acc_pair_t(other.__mapping_ref(), other.__accessor_ref())) @@ -226,8 +236,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -240,8 +250,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + 
_MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -271,9 +281,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeTypes) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + extents_type::rank() == sizeof...(SizeTypes) && + (detail::are_valid_indices()) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -285,8 +294,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -299,8 +308,8 @@ class mdspan MDSPAN_TEMPLATE_REQUIRES( class SizeType, /* requires */ ( - _MDSPAN_TRAIT(std::is_convertible, SizeType, index_type) && - _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, SizeType) + _MDSPAN_TRAIT(std::is_convertible, const SizeType&, index_type) && + _MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, const SizeType&) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -311,7 +320,7 @@ class mdspan #endif // __cpp_lib_span #endif // MDSPAN_USE_PAREN_OPERATOR - MDSPAN_INLINE_FUNCTION constexpr size_t size() const noexcept { + MDSPAN_INLINE_FUNCTION constexpr size_type size() const noexcept { return __impl::__size(*this); }; @@ -346,13 +355,13 @@ class mdspan //-------------------------------------------------------------------------------- // [mdspan.basic.obs], mdspan observers of the mapping - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return mapping_type::is_always_unique(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { return mapping_type::is_always_exhaustive(); }; - MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return mapping_type::is_always_strided(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() { return mapping_type::is_always_unique(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() { return mapping_type::is_always_exhaustive(); }; + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() { return mapping_type::is_always_strided(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const noexcept { return __mapping_ref().is_unique(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const noexcept { return __mapping_ref().is_exhaustive(); }; - MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const noexcept { return __mapping_ref().is_strided(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_unique() const { return __mapping_ref().is_unique(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_exhaustive() const { return __mapping_ref().is_exhaustive(); }; + MDSPAN_INLINE_FUNCTION constexpr bool is_strided() const { return __mapping_ref().is_strided(); }; MDSPAN_INLINE_FUNCTION constexpr index_type stride(size_t r) const { return __mapping_ref().stride(r); }; private: @@ -374,7 +383,7 @@ class mdspan #if defined(_MDSPAN_USE_CLASS_TEMPLATE_ARGUMENT_DEDUCTION) MDSPAN_TEMPLATE_REQUIRES( class ElementType, class... 
SizeTypes, - /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_integral, SizeTypes) /* && ... */) && + /* requires */ _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, SizeTypes, size_t) /* && ... */) && (sizeof...(SizeTypes) > 0) ) MDSPAN_DEDUCTION_GUIDE explicit mdspan(ElementType*, SizeTypes...) diff --git a/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp b/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp index 3950273a83d..bdc5925f715 100644 --- a/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp +++ b/tpls/mdspan/include/experimental/__p1684_bits/mdarray.hpp @@ -103,8 +103,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) && (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t) || container_is_array::value) && @@ -133,61 +133,29 @@ class mdarray { ) : map_(m), ctr_(container_is_array::construct(map_)) { } - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(const container_type& ctr, SizeTypes... dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(ctr) - { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (const container_type& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, const container_type& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(const container_type& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, const container_type& ctr) : map_(m), ctr_(ctr) { assert(ctr.size() >= static_cast(map_.required_span_size())); } - - // Constructors from container - MDSPAN_TEMPLATE_REQUIRES( - class... SizeTypes, - /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - _MDSPAN_TRAIT( std::is_constructible, extents_type, SizeTypes...) && - _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type) - ) - ) - MDSPAN_INLINE_FUNCTION - explicit constexpr mdarray(container_type&& ctr, SizeTypes... 
dynamic_extents) - : map_(extents_type(dynamic_extents...)), ctr_(std::move(ctr)) - { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - - MDSPAN_FUNCTION_REQUIRES( (MDSPAN_INLINE_FUNCTION constexpr), - mdarray, (container_type&& ctr, const extents_type& exts), , + mdarray, (const extents_type& exts, container_type&& ctr), , /* requires */ (_MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) : map_(exts), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - constexpr mdarray(container_type&& ctr, const mapping_type& m) + constexpr mdarray(const mapping_type& m, container_type&& ctr) : map_(m), ctr_(std::move(ctr)) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } - MDSPAN_TEMPLATE_REQUIRES( class OtherElementType, class OtherExtents, class OtherLayoutPolicy, class OtherContainer, /* requires */ ( @@ -229,7 +197,7 @@ class mdarray { _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, const container_type& ctr, const Alloc& a) : map_(exts), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -238,7 +206,7 @@ class mdarray { /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(const container_type& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, const container_type& ctr, const Alloc& a) : map_(map), ctr_(ctr, a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -248,7 +216,7 @@ class mdarray { _MDSPAN_TRAIT( std::is_constructible, mapping_type, extents_type)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const extents_type& exts, const Alloc& a) + constexpr mdarray(const extents_type& exts, container_type&& ctr, const Alloc& a) : map_(exts), ctr_(std::move(ctr), a) { assert(ctr_.size() >= static_cast(map_.required_span_size())); } @@ -257,7 +225,7 @@ class mdarray { /* requires */ (_MDSPAN_TRAIT( std::is_constructible, container_type, size_t, Alloc)) ) MDSPAN_INLINE_FUNCTION - constexpr mdarray(container_type&& ctr, const mapping_type& map, const Alloc& a) + constexpr mdarray(const mapping_type& map, container_type&& ctr, const Alloc& a) : map_(map), ctr_(std::move(ctr), a) { assert(ctr_.size() >= map_.required_span_size()); } @@ -344,8 +312,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... */) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -356,8 +324,8 @@ class mdarray { MDSPAN_TEMPLATE_REQUIRES( class... SizeTypes, /* requires */ ( - _MDSPAN_FOLD_AND(_MDSPAN_TRAIT( std::is_convertible, SizeTypes, index_type) /* && ... 
*/) && - extents_type::rank() == sizeof...(SizeTypes) + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) && + extents_type::rank() == sizeof...(SizeTypes) ) ) MDSPAN_FORCE_INLINE_FUNCTION @@ -433,8 +401,9 @@ class mdarray { class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + mdspan_type) ) ) constexpr operator mdspan () { @@ -445,8 +414,9 @@ class mdarray { class OtherElementType, class OtherExtents, class OtherLayoutType, class OtherAccessorType, /* requires */ ( - _MDSPAN_TRAIT(std::is_assignable, const_mdspan_type, - mdspan) + _MDSPAN_TRAIT(std::is_assignable, + mdspan, + const_mdspan_type) ) ) constexpr operator mdspan () const { diff --git a/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp b/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp index 58f38620ba1..89ba8202fb1 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/strided_slice.hpp @@ -20,7 +20,6 @@ #include namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace { template @@ -29,6 +28,7 @@ namespace { template struct __mdspan_is_integral_constant>: std::true_type {}; } + // Slice Specifier allowing for strides and compile time extent template struct strided_slice { @@ -36,14 +36,13 @@ struct strided_slice { using extent_type = ExtentType; using stride_type = StrideType; - OffsetType offset; - ExtentType extent; - StrideType stride; + _MDSPAN_NO_UNIQUE_ADDRESS OffsetType offset{}; + _MDSPAN_NO_UNIQUE_ADDRESS ExtentType extent{}; + _MDSPAN_NO_UNIQUE_ADDRESS StrideType stride{}; static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); static_assert(std::is_integral_v || __mdspan_is_integral_constant::value); }; -} // MDSPAN_IMPL_PROPOSED_NAMESPACE } // MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp b/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp index b9672b7f9ac..abddd0b59df 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/submdspan.hpp @@ -20,23 +20,21 @@ #include "submdspan_mapping.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { template MDSPAN_INLINE_FUNCTION constexpr auto submdspan(const mdspan &src, SliceSpecifiers... 
slices) { - const auto sub_mapping_offset = submdspan_mapping(src.mapping(), slices...); + const auto sub_submdspan_mapping_result = submdspan_mapping(src.mapping(), slices...); // NVCC has a problem with the deduction so lets figure out the type - using sub_mapping_t = std::remove_cv_t; + using sub_mapping_t = std::remove_cv_t; using sub_extents_t = typename sub_mapping_t::extents_type; using sub_layout_t = typename sub_mapping_t::layout_type; using sub_accessor_t = typename AccessorPolicy::offset_policy; return mdspan( - src.accessor().offset(src.data_handle(), sub_mapping_offset.offset), - sub_mapping_offset.mapping, + src.accessor().offset(src.data_handle(), sub_submdspan_mapping_result.offset), + sub_submdspan_mapping_result.mapping, sub_accessor_t(src.accessor())); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp index f56ce023f16..c3b2f78fb99 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_extents.hpp @@ -20,7 +20,6 @@ #include "strided_slice.hpp" namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { namespace detail { // Mapping from submapping ranks to srcmapping ranks @@ -319,5 +318,4 @@ constexpr auto submdspan_extents(const extents &src_exts, return detail::extents_constructor::next_extent( src_exts, slices...); } -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp index 48778d57e75..ca6948c9a9f 100644 --- a/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp +++ b/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp @@ -22,21 +22,15 @@ #include // index_sequence namespace MDSPAN_IMPL_STANDARD_NAMESPACE { -namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { //****************************************** // Return type of submdspan_mapping overloads //****************************************** -template struct mapping_offset { - Mapping mapping; +template struct submdspan_mapping_result { + _MDSPAN_NO_UNIQUE_ADDRESS LayoutMapping mapping{}; size_t offset; }; -} // namespace MDSPAN_IMPL_PROPOSED_NAMESPACE namespace detail { -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::first_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::stride_of; -using MDSPAN_IMPL_PROPOSED_NAMESPACE::detail::inv_map_rank; - // constructs sub strides template MDSPAN_INLINE_FUNCTION @@ -98,17 +92,15 @@ struct preserve_layout_left_mapping, SubRank, #pragma diag_suppress = implicit_return_from_non_void_function #endif // Actual submdspan mapping call -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_left::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; +layout_left::mapping::submdspan_mapping_impl(SliceSpecifiers... 
slices) const { // compute sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // figure out sub layout type @@ -121,18 +113,18 @@ submdspan_mapping(const layout_left::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_left case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -140,7 +132,7 @@ submdspan_mapping(const layout_left::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -207,17 +199,15 @@ struct preserve_layout_right_mapping, SubRank, #pragma diagnostic push #pragma diag_suppress = implicit_return_from_non_void_function #endif -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_right::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - +layout_right::mapping::submdspan_mapping_impl( + SliceSpecifiers... 
slices) const { // get sub extents using src_ext_t = Extents; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); // determine new layout type @@ -230,18 +220,18 @@ submdspan_mapping(const layout_right::mapping &src_mapping, if constexpr (std::is_same_v) { // layout_right case - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext), - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } else { // layout_stride case auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -249,7 +239,7 @@ submdspan_mapping(const layout_right::mapping &src_mapping, #else std::tuple{detail::stride_of(slices)...})), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } #if defined(__NVCC__) && !defined(__CUDA_ARCH__) && defined(__GNUC__) __builtin_unreachable(); @@ -270,23 +260,22 @@ submdspan_mapping(const layout_right::mapping &src_mapping, //********************************** // layout_stride submdspan_mapping //********************************* -template +template +template MDSPAN_INLINE_FUNCTION constexpr auto -submdspan_mapping(const layout_stride::mapping &src_mapping, - SliceSpecifiers... slices) { - using MDSPAN_IMPL_PROPOSED_NAMESPACE::submdspan_extents; - using MDSPAN_IMPL_PROPOSED_NAMESPACE::mapping_offset; - auto dst_ext = submdspan_extents(src_mapping.extents(), slices...); +layout_stride::mapping::submdspan_mapping_impl( + SliceSpecifiers... slices) const { + auto dst_ext = submdspan_extents(extents(), slices...); using dst_ext_t = decltype(dst_ext); auto inv_map = detail::inv_map_rank( std::integral_constant(), std::index_sequence<>(), slices...); using dst_mapping_t = typename layout_stride::template mapping; - return mapping_offset{ + return submdspan_mapping_result{ dst_mapping_t(dst_ext, detail::construct_sub_strides( - src_mapping, inv_map, + *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have the issue #if defined(_MDSPAN_HAS_HIP) || (defined(__NVCC__) && (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) @@ -294,6 +283,7 @@ submdspan_mapping(const layout_stride::mapping &src_mapping, #else std::tuple(detail::stride_of(slices)...))), #endif - static_cast(src_mapping(detail::first_of(slices)...))}; + static_cast(this->operator()(detail::first_of(slices)...))}; } + } // namespace MDSPAN_IMPL_STANDARD_NAMESPACE diff --git a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp new file mode 100644 index 00000000000..a8014867923 --- /dev/null +++ b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded.hpp @@ -0,0 +1,793 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +#pragma once + +#include +#include "layout_padded_fwd.hpp" +#include "../__p0009_bits/dynamic_extent.hpp" +#include "../__p0009_bits/extents.hpp" +#include "../__p0009_bits/mdspan.hpp" +#include "../__p0009_bits/layout_left.hpp" +#include "../__p0009_bits/layout_right.hpp" +#include "../__p0009_bits/layout_stride.hpp" + +namespace MDSPAN_IMPL_STANDARD_NAMESPACE { +namespace MDSPAN_IMPL_PROPOSED_NAMESPACE { + +namespace detail { +template +MDSPAN_INLINE_FUNCTION +constexpr _T +find_next_multiple(_T alignment, _T offset) +{ + if ( alignment == 0 ) { + return _T(0); + } else { + return ( ( offset + alignment - 1 ) / alignment) * alignment; + } +} + +template +MDSPAN_INLINE_FUNCTION constexpr size_t get_actual_static_padding_value() { + constexpr auto rank = _ExtentsType::rank(); + + if constexpr (rank <= typename _ExtentsType::rank_type(1)) { + return 0; + } else if constexpr (_PaddingValue != dynamic_extent && + _ExtentsType::static_extent(_ExtentToPadIdx) != + dynamic_extent) { + static_assert( + (_PaddingValue != 0) || + (_ExtentsType::static_extent(_ExtentToPadIdx) == 0), + "padding stride can be 0 only if " + "extents_type::static_extent(extent-to-pad) is 0 or dynamic_extent"); + return find_next_multiple(_PaddingValue, + _ExtentsType::static_extent(_ExtentToPadIdx)); + } else { + return dynamic_extent; + } +} + +template +struct static_array_type_for_padded_extent +{ + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, + detail::get_actual_static_padding_value()>; +}; + +template +struct static_array_type_for_padded_extent<_PaddingValue, _Extents, + _ExtentToPadIdx, Rank, std::enable_if_t> { + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using type = + ::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::maybe_static_array< + index_type, size_t, dynamic_extent, 0>; +}; + +template +struct padded_extent { + static constexpr size_t padding_value = _PaddingValue; + using index_type = typename _Extents::index_type; + using extents_type = _Extents; + using static_array_type = typename static_array_type_for_padded_extent< + padding_value, _Extents, _ExtentToPadIdx, _Extents::rank()>::type; + + static constexpr auto static_value() { return static_array_type::static_value(0); } + + MDSPAN_INLINE_FUNCTION + static constexpr static_array_type + init_padding(const _Extents &exts) { + if constexpr ((_Extents::rank() > 1) && (padding_value == dynamic_extent)) { + return {exts.extent(_ExtentToPadIdx)}; + } else { + return init_padding(exts, padding_value); + } + } + + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Extents &exts, + [[maybe_unused]] index_type pv) { + if constexpr (_Extents::rank() > 1) { + return {find_next_multiple(pv, + exts.extent(_ExtentToPadIdx))}; + } else { + return {}; + } + } + + template + MDSPAN_INLINE_FUNCTION static constexpr static_array_type + init_padding([[maybe_unused]] const _Mapping &other_mapping, + 
std::integral_constant<size_t, _PaddingStrideIdx>) {
+    if constexpr (_Extents::rank() > 1) {
+      return {other_mapping.stride(_PaddingStrideIdx)};
+    } else {
+      return {};
+    }
+  }
+};
+} // namespace detail
+
+template <size_t PaddingValue>
+template <class Extents>
+class layout_left_padded<PaddingValue>::mapping {
+public:
+  static constexpr size_t padding_value = PaddingValue;
+
+  using extents_type = Extents;
+  using index_type = typename extents_type::index_type;
+  using size_type = typename extents_type::size_type;
+  using rank_type = typename extents_type::rank_type;
+  using layout_type = layout_left_padded<padding_value>;
+
+#ifndef MDSPAN_INTERNAL_TEST
+private:
+#endif // MDSPAN_INTERNAL_TEST
+
+  static constexpr rank_type padded_stride_idx = detail::layout_padded_constants<layout_type, extents_type>::padded_stride_idx;
+  static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants<layout_type, extents_type>::extent_to_pad_idx;
+
+  static_assert((padding_value != 0)
+                || (extents_type::static_extent(extent_to_pad_idx) == 0)
+                || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent),
+                "if padding stride is 0, static_extent(extent-to-pad-rank) must also be 0 or dynamic_extent");
+
+  using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >;
+
+  static constexpr size_t static_padding_stride = padded_stride_type::static_value();
+
+  typename padded_stride_type::static_array_type padded_stride = {};
+  extents_type exts = {};
+
+  constexpr index_type compute_offset(std::index_sequence<>) const {
+    return 0;
+  }
+
+  template <size_t Rank, class IndexOffset>
+  constexpr index_type compute_offset(std::index_sequence<Rank>,
+                                      IndexOffset index_offset) const {
+    return index_offset;
+  }
+
+  template <size_t... Ranks, class... IndexOffsets>
+  constexpr index_type compute_offset(std::index_sequence<Ranks...>,
+                                      IndexOffsets... index_offsets) const {
+    index_type indices[] = {static_cast<index_type>(index_offsets)...};
+    // self-recursive fold trick from
+    // https://github.com/llvm/llvm-project/blob/96e1914aa2e6d8966acbfbe2f4d184201f1aa318/libcxx/include/mdspan/layout_left.h#L144
+    index_type res = 0;
+    ((res = indices[extents_type::rank() - 1 - Ranks] +
+            ((extents_type::rank() - 1 - Ranks) == extent_to_pad_idx
+                 ? padded_stride.value(0)
+                 : exts.extent(extents_type::rank() - 1 - Ranks)) *
+                res),
+     ...);
+    return res;
+  }
+
+public:
+#if !MDSPAN_HAS_CXX_20
+  MDSPAN_INLINE_FUNCTION_DEFAULTED
+  constexpr mapping()
+      : mapping(extents_type{})
+  {}
+#else
+  MDSPAN_INLINE_FUNCTION_DEFAULTED
+  constexpr mapping()
+    requires(static_padding_stride != dynamic_extent) = default;
+
+  MDSPAN_INLINE_FUNCTION
+  constexpr mapping()
+    requires(static_padding_stride == dynamic_extent)
+      : mapping(extents_type{})
+  {}
+#endif
+
+  MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default;
+  MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default;
+
+  /**
+   * Initializes the mapping with the given extents.
+   *
+   * \param ext the given extents
+   */
+  MDSPAN_INLINE_FUNCTION
+  constexpr mapping(const extents_type& ext)
+      : padded_stride(padded_stride_type::init_padding(ext)), exts(ext)
+  {}
+
+  /**
+   * Initializes the mapping with the given extents and the specified padding value.
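+   *
+   * A minimal sketch (illustrative only, not part of the change; assumes a
+   * rank-2 all-dynamic extents type):
+   * \code
+   * using E = extents<int, dynamic_extent, dynamic_extent>;
+   * layout_left_padded<dynamic_extent>::mapping<E> m{E{3, 7}, 4};
+   * // m.stride(1) == 4: extent(0) == 3, rounded up to the next multiple of 4
+   * \endcode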
+ * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, dynamic_padding_value)), exts(ext) + { + assert((padding_value == dynamic_extent) || (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_left::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_left::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (static_padding_stride != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (static_padding_stride == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + } + + /** + * Converting constructor from `layout_left_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 && (padding_value == dynamic_extent || _Mapping::padding_value == dynamic_extent))) + constexpr + mapping(const _Mapping &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert(padding_value == dynamic_extent || + _Mapping::padding_value == dynamic_extent || + padding_value == _Mapping::padding_value); + } + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v` is `true`. 
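+   *
+   * A minimal sketch (illustrative only, not part of the change; at rank 1 no
+   * padding is applied, so the left- and right-padded layouts agree):
+   * \code
+   * using E = extents<int, 5>;
+   * layout_right_padded<4>::mapping<E> r{E{}};
+   * layout_left_padded<dynamic_extent>::mapping<E> l{r};  // both contiguous
+   * \endcode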
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_right_padded_mapping<_Mapping>::value + && extents_type::rank() <= 1 + && std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v)) + constexpr + mapping(const _Mapping &other_mapping) noexcept + : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))), + exts(other_mapping.extents()) + {} + + constexpr const extents_type &extents() const noexcept + { + return exts; + } + + constexpr std::array + strides() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return {}; + } else if constexpr ( extents_type::rank() == 1 ) { + return {1}; + } else { + index_type value = 1; + std::array s{}; + s[extent_to_pad_idx] = value; + value *= padded_stride.value(0); + for (rank_type r = extent_to_pad_idx + 1; r < extents_type::rank() - 1; ++r) + { + s[r] = value; + value *= exts.extent(r); + } + s[extents_type::rank() - 1] = value; + return s; + } + } + + constexpr index_type + required_span_size() const noexcept + { + if constexpr ( extents_type::rank() == 0 ) { + return 1; + } else if constexpr ( extents_type::rank() == 1 ) { + return exts.extent(0); + } else { + index_type value = padded_stride.value(0); + for (rank_type r = 1; r < extents_type::rank(); ++r) { + value *= exts.extent(r); + } + return value; + } + } + + /** + * Return the mapping given the provided indices per rank. + * + * This overload participates in overload resolution only if: + * - `sizeof...(Indices) == extents_type::rank()`, + * - `(is_convertible_v && ...) is true`, and + * - (is_nothrow_constructible_v && ...) is true. + */ + MDSPAN_TEMPLATE_REQUIRES( + class... _Indices, + /* requires */ ( + sizeof...(_Indices) == extents_type::rank() && + (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices()) + ) + ) + constexpr size_t operator()(_Indices... idxs) const noexcept + { + return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...); + } + + static constexpr bool is_always_unique() noexcept { return true; } + static constexpr bool is_always_exhaustive() noexcept + { + return (extents_type::rank() <= rank_type(1)) + || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent + && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value()); + } + static constexpr bool is_always_strided() noexcept { return true; } + + static constexpr bool is_unique() noexcept { return true; } + constexpr bool is_exhaustive() const noexcept + { + return (extents_type::rank() < 2) + || (exts.extent(extent_to_pad_idx) == padded_stride.value(0)); + } + static constexpr bool is_strided() noexcept { return true; } + + constexpr index_type stride(rank_type r) const noexcept + { + assert(r < extents_type::rank()); + if(r == 0) return index_type(1); + + index_type value = padded_stride.value(0); + for (rank_type k = 1; k < r; k++) value *= exts.extent(k); + + return value; + } + + /** + * Equality operator between `layout_left_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + * + * \note There is currently a difference from p2642r2, where this function is specified as taking + * `layout_left_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible. 
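+   *
+   * A minimal sketch (illustrative only, not part of the change): mappings with
+   * different padding_value template arguments compare equal when their extents
+   * and padded strides match:
+   * \code
+   * using E = extents<int, 8, 2>;
+   * layout_left_padded<8>::mapping<E> a{E{}};
+   * layout_left_padded<dynamic_extent>::mapping<E> b{E{}, 8};
+   * // a == b: same extents and stride(1) == 8 for both
+   * \endcode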
+ */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept + { + // Workaround for some compilers not short-circuiting properly with compile-time checks + // i.e. we can't access stride(_padding_stride_idx) of a rank 0 mapping + bool strides_equal = true; + if constexpr (extents_type::rank() > rank_type(1)) + { + strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx); + } + return (left.extents() == right.extents()) && strides_equal; + } + +#if !MDSPAN_HAS_CXX_20 + /** + * Inequality operator between `layout_left_padded`s + * + * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`. + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Mapping, + /* requires */ ( + detail::is_layout_left_padded_mapping<_Mapping>::value + && (_Mapping::extents_type::rank() == extents_type::rank()) + ) + ) + friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept + { + return !(left == right); + } +#endif +}; + +template +template +class layout_right_padded::mapping { +public: + static constexpr size_t padding_value = PaddingValue; + + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_right_padded; + +#ifndef MDSPAN_INTERNAL_TEST + private: +#endif // MDSPAN_INTERNAL_TEST + + static constexpr rank_type padded_stride_idx = detail::layout_padded_constants::padded_stride_idx; + static constexpr rank_type extent_to_pad_idx = detail::layout_padded_constants::extent_to_pad_idx; + + static_assert((padding_value != 0) + || (extents_type::static_extent(extent_to_pad_idx) == 0) + || (extents_type::static_extent(extent_to_pad_idx) == dynamic_extent), + "if padding stride is 0, static_extent(extent-to-pad-rank) must also be 0 or dynamic_extent"); + + using padded_stride_type = detail::padded_extent< padding_value, extents_type, extent_to_pad_idx >; + static constexpr size_t static_padding_stride = padded_stride_type::static_value(); + + typename padded_stride_type::static_array_type padded_stride = {}; + extents_type exts = {}; + + constexpr index_type compute_offset(std::index_sequence<>) const { + return 0; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffset index_offset) const { + return index_offset; + } + + template + constexpr index_type compute_offset(std::index_sequence, + IndexOffsets... index_offsets) const { + // self-recursive fold trick from + // https://github.com/llvm/llvm-project/blob/4d9771741d40cc9cfcccb6b033f43689d36b705a/libcxx/include/mdspan/layout_right.h#L141 + index_type res = 0; + ((res = static_cast(index_offsets) + + (Ranks == extent_to_pad_idx ? 
padded_stride.value(0) + : exts.extent(Ranks)) * + res), + ...); + return res; + } + +public: +#if !MDSPAN_HAS_CXX_20 + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + : mapping(extents_type{}) + {} +#else + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping() + requires(static_padding_stride != dynamic_extent) = default; + + MDSPAN_INLINE_FUNCTION + constexpr mapping() + requires(static_padding_stride == dynamic_extent) + : mapping(extents_type{}) + {} +#endif + + MDSPAN_INLINE_FUNCTION_DEFAULTED constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED mapping& operator=(const mapping&) noexcept = default; + + /** + * Initializes the mapping with the given extents. + * + * \param ext the given extents + */ + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext) + : padded_stride(padded_stride_type::init_padding(ext)), exts(ext) {} + + /** + * Initializes the mapping with the given extents and the specified padding value. + * + * This overload participates in overload resolution only if `is_convertible_v` + * is `true` and `is_nothrow_constructible_v` is `true` + * + * \param ext the given extents + * \param padding_value the padding value + */ + MDSPAN_TEMPLATE_REQUIRES( + class _Size, + /* requires */ ( + std::is_convertible_v<_Size, index_type> + && std::is_nothrow_constructible_v + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr mapping(const extents_type &ext, _Size dynamic_padding_value) + : padded_stride(padded_stride_type::init_padding(ext, static_cast(dynamic_padding_value))), + exts(ext) { + assert((padding_value == dynamic_extent) || + (static_cast(padding_value) == static_cast(dynamic_padding_value))); + } + + /** + * Converting constructor from `layout_right::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. + * If `OtherExtents::rank() > 1` then one of `padding_value`, `static_extent(0)`, or `OtherExtents::static_extent(0)` must be `dynamic_extent`; + * otherwise, `OtherExtents::static_extent(0)` must be equal to the least multiple of `padding_value` greater than or equal to `extents_type::static_extent(0)` + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<_OtherExtents, extents_type>)) + constexpr mapping(const layout_right::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + { + static_assert((_OtherExtents::rank() > 1) || (padded_stride_type::static_value() != dynamic_extent) || (_OtherExtents::static_extent(extent_to_pad_idx) != dynamic_extent) + || (padded_stride_type::static_value() == _OtherExtents::static_extent(extent_to_pad_idx))); + } + + /** + * Converting constructor from `layout_stride::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true + */ + MDSPAN_TEMPLATE_REQUIRES( + class _OtherExtents, + /* requires */ ( + std::is_constructible_v + ) + ) + MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 0)) + constexpr mapping(const layout_stride::mapping<_OtherExtents> &other_mapping) + : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant{})), + exts(other_mapping.extents()) + {} + + /** + * Converting constructor from `layout_right_padded::mapping`. + * + * This overload participates in overload resolution only if `is_constructible_v` is true. 
+
+  /**
+   * Converting constructor from `layout_right_padded::mapping`.
+   *
+   * This overload participates in overload resolution only if `is_constructible_v<extents_type, OtherExtents>` is true.
+   * Either `padding_value` or `OtherPaddingStride` must be `std::dynamic_extent`, or `padding_value == OtherPaddingStride`.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class _Mapping,
+    /* requires */ (
+      detail::is_layout_right_padded_mapping<_Mapping>::value
+      && std::is_constructible_v<extents_type, typename _Mapping::extents_type>
+    )
+  )
+  MDSPAN_CONDITIONAL_EXPLICIT((extents_type::rank() > 1 &&
+                               (padding_value == dynamic_extent ||
+                                _Mapping::padding_value == dynamic_extent)))
+  constexpr mapping(const _Mapping &other_mapping)
+      : padded_stride(padded_stride_type::init_padding(other_mapping, std::integral_constant<size_t, padded_stride_idx>{})),
+        exts(other_mapping.extents())
+  {
+    static_assert(padding_value == dynamic_extent ||
+                  _Mapping::padding_value == dynamic_extent ||
+                  padding_value == _Mapping::padding_value);
+  }
+
+  /**
+   * Converting constructor from `layout_left_padded::mapping`.
+   *
+   * This overload participates in overload resolution only if `extents_type::rank()` is 0 or 1 and `is_constructible_v<extents_type, OtherExtents>` is `true`.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class _Mapping,
+    /* requires */ (
+      detail::is_layout_left_padded_mapping<_Mapping>::value
+      && extents_type::rank() <= 1
+      && std::is_constructible_v<extents_type, typename _Mapping::extents_type>
+    )
+  )
+  MDSPAN_CONDITIONAL_EXPLICIT((!std::is_convertible_v<typename _Mapping::extents_type, extents_type>))
+  constexpr mapping(const _Mapping &other_mapping) noexcept
+      : padded_stride(padded_stride_type::init_padding(other_mapping.extents(), other_mapping.extents().extent(extent_to_pad_idx))),
+        exts(other_mapping.extents())
+  {}
+
+  constexpr const extents_type &extents() const noexcept
+  {
+    return exts;
+  }
+
+  constexpr std::array<index_type, extents_type::rank()>
+  strides() const noexcept
+  {
+    if constexpr ( extents_type::rank() == 0 ) {
+      return {};
+    } else if constexpr ( extents_type::rank() == 1 ) {
+      return {1};
+    } else {
+      index_type value = 1;
+      std::array<index_type, extents_type::rank()> s{};
+      s[extent_to_pad_idx] = value;
+      value *= padded_stride.value(0);
+      for (rank_type r = extent_to_pad_idx - 1; r > 0; --r)
+      {
+        s[r] = value;
+        value *= exts.extent(r);
+      }
+      s[0] = value;
+      return s;
+    }
+  }
+
+  constexpr index_type
+  required_span_size() const noexcept
+  {
+    if constexpr ( extents_type::rank() == 0 ) {
+      return 1;
+    } else if constexpr ( extents_type::rank() == 1 ) {
+      return exts.extent(0);
+    } else {
+      index_type value = 1;
+      for (rank_type r = 0; r < extent_to_pad_idx; ++r)
+      {
+        value *= exts.extent(r);
+      }
+      return value * padded_stride.value(0);
+    }
+  }
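Concretely, `strides()` and `required_span_size()` above reserve whole padded rows. A compile-time sketch (invented names, not part of the patch) for a rank-3 right-padded mapping with extents (2, 3, 5) and the last extent padded to 8:

#include <cstddef>

namespace stride_sketch {
constexpr std::size_t e0 = 2, e1 = 3, e2 = 5, p = 8;  // e2 = 5 rounds up to p = 8
constexpr std::size_t stride2 = 1;                    // rightmost stride is always 1
constexpr std::size_t stride1 = p;                    // padded stride replaces e2
constexpr std::size_t stride0 = e1 * p;               // remaining strides multiply extents
static_assert(stride0 == 24 && stride1 == 8 && stride2 == 1);
// Only e0 * e1 * e2 elements are addressable in bounds, but the mapping
// reserves whole padded rows, so the required span is e0 * e1 * p.
static_assert(e0 * e1 * e2 == 30);
static_assert(e0 * e1 * p == 48);
}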
+
+  /**
+   * Return the linear offset given the provided indices per rank.
+   *
+   * This overload participates in overload resolution only if:
+   * - `sizeof...(Indices) == extents_type::rank()`,
+   * - `(is_convertible_v<Indices, index_type> && ...)` is `true`, and
+   * - `(is_nothrow_constructible_v<index_type, Indices> && ...)` is `true`.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class... _Indices,
+    /* requires */ (
+      sizeof...(_Indices) == extents_type::rank() &&
+      (::MDSPAN_IMPL_STANDARD_NAMESPACE::detail::are_valid_indices<index_type, _Indices...>())
+    )
+  )
+  constexpr size_t operator()(_Indices... idxs) const noexcept
+  {
+    return compute_offset(std::index_sequence_for<_Indices...>{}, idxs...);
+  }
+
+  static constexpr bool is_always_unique() noexcept { return true; }
+  static constexpr bool is_always_exhaustive() noexcept
+  {
+    return (extents_type::rank() <= rank_type(1))
+           || (extents_type::static_extent(extent_to_pad_idx) != dynamic_extent
+               && extents_type::static_extent(extent_to_pad_idx) == padded_stride_type::static_value());
+  }
+  static constexpr bool is_always_strided() noexcept { return true; }
+
+  static constexpr bool is_unique() noexcept { return true; }
+  constexpr bool is_exhaustive() const noexcept
+  {
+    return (extents_type::rank() < 2)
+           || (exts.extent(extent_to_pad_idx) == padded_stride.value(0));
+  }
+  static constexpr bool is_strided() noexcept { return true; }
+
+  constexpr index_type stride(rank_type r) const noexcept
+  {
+    assert(r < extents_type::rank());
+    if (r == extents_type::rank() - 1) return index_type(1);
+
+    index_type value = padded_stride.value(0);
+    for (rank_type k = extents_type::rank() - 2; k > r; k--) value *= exts.extent(k);
+
+    return value;
+  }
+
+  /**
+   * Equality operator between `layout_right_padded`s
+   *
+   * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`.
+   *
+   * \note There is currently a difference from p2642r2, where this function is specified as taking
+   * `layout_right_padded< padding_value >::mapping< Extents>`. However, this makes `padding_value` non-deducible.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class _Mapping,
+    /* requires */ (
+      detail::is_layout_right_padded_mapping<_Mapping>::value
+      && (_Mapping::extents_type::rank() == extents_type::rank())
+    )
+  )
+  friend constexpr bool operator==(const mapping &left, const _Mapping &right) noexcept
+  {
+    // Workaround for some compilers not short-circuiting properly with compile-time checks,
+    // i.e. we can't access stride(padded_stride_idx) of a rank-0 mapping
+    bool strides_equal = true;
+    if constexpr (extents_type::rank() > rank_type(1))
+    {
+      strides_equal = left.stride(padded_stride_idx) == right.stride(padded_stride_idx);
+    }
+    return (left.extents() == right.extents()) && strides_equal;
+  }
+
+#if !MDSPAN_HAS_CXX_20
+  /**
+   * Inequality operator between `layout_right_padded`s
+   *
+   * This overload only participates in overload resolution if `OtherExtents::rank() == extents_type::rank()`.
+   */
+  MDSPAN_TEMPLATE_REQUIRES(
+    class _Mapping,
+    /* requires */ (
+      detail::is_layout_right_padded_mapping<_Mapping>::value
+      && (_Mapping::extents_type::rank() == extents_type::rank())
+    )
+  )
+  friend constexpr bool operator!=(const mapping &left, const _Mapping &right) noexcept
+  {
+    return !(left == right);
+  }
+#endif
+};
+}
+}
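The `is_exhaustive()` query above is a single comparison: the mapping covers its required span without gaps exactly when the padded stride equals the extent being padded. A sketch under the same `std::experimental` namespace assumption as earlier, not part of the patch:

#include <mdspan/mdspan.hpp>
#include <cassert>
#include <cstddef>

void exhaustive_sketch() {
  namespace stdex = std::experimental;
  using Ext = stdex::extents<std::size_t, 3, 5>;
  using Map = stdex::layout_right_padded<stdex::dynamic_extent>::mapping<Ext>;
  Map tight{Ext{}, 5};   // 5 rounds up to 5: no padding inserted
  Map padded{Ext{}, 8};  // 5 rounds up to 8: a three-element gap per row
  assert(tight.is_exhaustive());
  assert(!padded.is_exhaustive());
}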
diff --git a/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp
new file mode 100644
index 00000000000..945f091a2dc
--- /dev/null
+++ b/tpls/mdspan/include/experimental/__p2642_bits/layout_padded_fwd.hpp
@@ -0,0 +1,117 @@
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 4.0
+// Copyright (2022) National Technology & Engineering
+// Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+#pragma once
+
+#include <cassert>
+#include "../__p0009_bits/dynamic_extent.hpp"
+
+namespace MDSPAN_IMPL_STANDARD_NAMESPACE {
+namespace MDSPAN_IMPL_PROPOSED_NAMESPACE {
+
+template <size_t PaddingValue = dynamic_extent>
+struct layout_left_padded {
+  template <class _Extents>
+  class mapping;
+};
+
+template <size_t PaddingValue = dynamic_extent>
+struct layout_right_padded {
+  template <class _Extents>
+  class mapping;
+};
+
+namespace detail {
+// The layout_padded_constants structs are only useful if rank > 1, otherwise they may wrap
+template <class _Layout, class _ExtentsType>
+struct layout_padded_constants;
+
+template <class _ExtentsType, size_t _PaddingStride>
+struct layout_padded_constants<layout_left_padded<_PaddingStride>, _ExtentsType>
+{
+  using rank_type = typename _ExtentsType::rank_type;
+  static constexpr rank_type padded_stride_idx = 1;
+  static constexpr rank_type extent_to_pad_idx = 0;
+};
+
+template <class _ExtentsType, size_t _PaddingStride>
+struct layout_padded_constants<layout_right_padded<_PaddingStride>, _ExtentsType>
+{
+  using rank_type = typename _ExtentsType::rank_type;
+  static constexpr rank_type padded_stride_idx = _ExtentsType::rank() - 2;
+  static constexpr rank_type extent_to_pad_idx = _ExtentsType::rank() - 1;
+};
+
+template <class _Layout>
+struct is_layout_left_padded : std::false_type {};
+
+template <size_t _PaddingStride>
+struct is_layout_left_padded<layout_left_padded<_PaddingStride>> : std::true_type {};
+
+template <class _Mapping, class _Enabled = void>
+struct is_layout_left_padded_mapping : std::false_type {};
+
+template <class _Mapping>
+struct is_layout_left_padded_mapping<_Mapping,
+  std::enable_if_t<std::is_same<_Mapping, typename layout_left_padded<_Mapping::padding_value>::template mapping<typename _Mapping::extents_type>>::value>>
+    : std::true_type {};
+
+template <class _Layout>
+struct is_layout_right_padded : std::false_type {};
+
+template <size_t _PaddingStride>
+struct is_layout_right_padded<layout_right_padded<_PaddingStride>> : std::true_type {};
+
+template <class _Mapping, class _Enabled = void>
+struct is_layout_right_padded_mapping : std::false_type {};
+
+template <class _Mapping>
+struct is_layout_right_padded_mapping<_Mapping,
+  std::enable_if_t<std::is_same<_Mapping, typename layout_right_padded<_Mapping::padding_value>::template mapping<typename _Mapping::extents_type>>::value>>
+    : std::true_type {};
+
+template <class _LayoutExtentsType, class _PaddedLayoutMappingType>
+constexpr void check_padded_layout_converting_constructor_mandates()
+{
+  if constexpr (_LayoutExtentsType::rank() > 1) {
+    using extents_type = typename _PaddedLayoutMappingType::extents_type;
+    constexpr auto padding_value = _PaddedLayoutMappingType::padding_value;
+    constexpr auto idx = layout_padded_constants<typename _PaddedLayoutMappingType::layout_type, _LayoutExtentsType>::extent_to_pad_idx;
+    if constexpr ((_LayoutExtentsType::static_extent(idx) != dynamic_extent) &&
+                  (extents_type::static_extent(idx) != dynamic_extent) &&
+                  (padding_value != dynamic_extent)) {
+      if constexpr (padding_value == 0) {
+        static_assert(_LayoutExtentsType::static_extent(idx) == 0);
+      } else {
+        static_assert(
+            _LayoutExtentsType::static_extent(idx) % padding_value == 0);
+      }
+    }
+  }
+}
+
+template <class _ExtentsType, class _OtherMapping>
+constexpr void check_padded_layout_converting_constructor_preconditions([[maybe_unused]] const _OtherMapping &other_mapping) {
+  if constexpr (_ExtentsType::rank() > 1) {
+    constexpr auto padded_stride_idx =
+        layout_padded_constants<typename _OtherMapping::layout_type, _ExtentsType>::padded_stride_idx;
+    constexpr auto extent_to_pad_idx = layout_padded_constants<typename _OtherMapping::layout_type, _ExtentsType>::extent_to_pad_idx;
+    assert(other_mapping.stride(padded_stride_idx) == other_mapping.extents().extent(extent_to_pad_idx));
+  }
+}
+}
+}
+}
diff --git a/tpls/mdspan/include/mdspan/mdspan.hpp b/tpls/mdspan/include/mdspan/mdspan.hpp
index b440873526a..ac72a1a4e64 100644
--- a/tpls/mdspan/include/mdspan/mdspan.hpp
+++ b/tpls/mdspan/include/mdspan/mdspan.hpp
@@ -35,6 +35,7 @@
 #include "../experimental/__p0009_bits/layout_right.hpp"
 #include "../experimental/__p0009_bits/macros.hpp"
 #if MDSPAN_HAS_CXX_17
+#include "../experimental/__p2642_bits/layout_padded.hpp"
 #include "../experimental/__p2630_bits/submdspan.hpp"
 #endif
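With the include wired into mdspan.hpp, the padded layouts become available in any C++17 build. An end-to-end sketch, again assuming the standalone `std::experimental` namespace and the default configuration in which mdspan's paren call operator is enabled pre-C++23; the numbers and names are illustrative only, not part of the patch:

#include <mdspan/mdspan.hpp>
#include <cstddef>

int padded_mdspan_sketch() {
  namespace stdex = std::experimental;
  // Column-major 3 x 3 matrix whose columns are padded to 4 elements,
  // e.g. to keep every column aligned for vectorized access.
  using Ext = stdex::extents<std::size_t, 3, 3>;
  using Layout = stdex::layout_left_padded<4>;
  float storage[4 * 3] = {};
  stdex::mdspan<float, Ext, Layout> m{storage, Layout::mapping<Ext>{}};
  m(2, 1) = 1.0f;  // element offset 2 + 1 * 4 == 6 in storage
  return static_cast<int>(m.mapping().required_span_size());  // 4 * 3 == 12
}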